AHCI RESEARCH GROUP
Publications
Papers published in international journals,
proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
2025
Song, T.; Pabst, F.; Eck, U.; Navab, N.
Enhancing Patient Acceptance of Robotic Ultrasound through Conversational Virtual Agent and Immersive Visualizations Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2901–2911, 2025, ISSN: 1077-2626.
@article{song_enhancing_2025,
title = {Enhancing Patient Acceptance of Robotic Ultrasound through Conversational Virtual Agent and Immersive Visualizations},
author = {T. Song and F. Pabst and U. Eck and N. Navab},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003687673&doi=10.1109%2fTVCG.2025.3549181&partnerID=40&md5=1d46569933582ecf5e967f0794aafc07},
doi = {10.1109/TVCG.2025.3549181},
issn = {1077-2626},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2901–2911},
abstract = {Robotic ultrasound systems have the potential to improve medical diagnostics, but patient acceptance remains a key challenge. To address this, we propose a novel system that combines an AI-based virtual agent, powered by a large language model (LLM), with three mixed reality visualizations aimed at enhancing patient comfort and trust. The LLM enables the virtual assistant to engage in natural, conversational dialogue with patients, answering questions in any format and offering real-time reassurance, creating a more intelligent and reliable interaction. The virtual assistant is animated as controlling the ultrasound probe, giving the impression that the robot is guided by the assistant. The first visualization employs augmented reality (AR), allowing patients to see the real world and the robot with the virtual avatar superimposed. The second visualization is an augmented virtuality (AV) environment, where the real-world body part being scanned is visible, while a 3D Gaussian Splatting reconstruction of the room, excluding the robot, forms the virtual environment. The third is a fully immersive virtual reality (VR) experience, featuring the same 3D reconstruction but entirely virtual, where the patient sees a virtual representation of their body being scanned in a robot-free environment. In this case, the virtual ultrasound probe mirrors the movement of the probe controlled by the robot, creating a synchronized experience as it touches and moves over the patient's virtual body. We conducted a comprehensive agent-guided robotic ultrasound study with all participants, comparing these visualizations against a standard robotic ultrasound procedure. Results showed significant improvements in patient trust, acceptance, and comfort. Based on these findings, we offer insights into designing future mixed reality visualizations and virtual agents to further enhance patient comfort and acceptance in autonomous medical procedures. © 1995-2012 IEEE.},
keywords = {3D reconstruction, adult, Augmented Reality, Computer graphics, computer interface, echography, female, human, Humans, Imaging, Intelligent robots, Intelligent virtual agents, Language Model, male, Medical robotics, Middle Aged, Mixed reality, Patient Acceptance of Health Care, patient attitude, Patient comfort, procedures, Real-world, Reality visualization, Robotic Ultrasound, Robotics, Three-Dimensional, three-dimensional imaging, Trust and Acceptance, Ultrasonic applications, Ultrasonic equipment, Ultrasonography, Ultrasound probes, User-Computer Interface, Virtual agent, Virtual assistants, Virtual environments, Virtual Reality, Visual languages, Visualization, Young Adult},
pubstate = {published},
tppubtype = {article}
}
Coronado, A.; Carvalho, S. T.; Berretta, L.
See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People Proceedings Article
In: IMX - Proc. ACM Int. Conf. Interact. Media Experiences, pp. 451–457, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071391-0 (ISBN).
@inproceedings{coronado_see_2025,
title = {See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People},
author = {A. Coronado and S. T. Carvalho and L. Berretta},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007991842&doi=10.1145%2f3706370.3731641&partnerID=40&md5=2f7cb1535d39d5e59b1f43f773de3272},
doi = {10.1145/3706370.3731641},
isbn = {979-840071391-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {IMX - Proc. ACM Int. Conf. Interact. Media Experiences},
pages = {451–457},
publisher = {Association for Computing Machinery, Inc},
abstract = {Extended Reality (XR) is quickly expanding "as the next major technology wave in personal computing". Nevertheless, this expansion and adoption could also exclude certain disabled users, particularly people with visual impairment (VIP). According to the World Health Organization (WHO) in their 2019 publication, there were at least 2.2 billion people with visual impairment, a number that is also estimated to have increased in recent years. Therefore, it is important to include disabled users, especially visually impaired people, in the design of Head-Mounted Displays and Extended Reality environments. Indeed, this objective can be pursued by incorporating Multimodal Large Language Model (MLLM) technology, which can assist visually impaired people. As a case study, this study employs different prompts that result in environment descriptions from an MLLM integrated into a virtual reality (VR) escape room. Therefore, six potential prompts were engineered to generate valuable outputs for visually impaired users inside a VR environment. These outputs were evaluated using the G-Eval and VIEScore metrics. Even though the results show that the prompt patterns provided a description that aligns with the user's point of view, it is highly recommended to evaluate these outputs through "expected outputs" from Orientation and Mobility Specialists and Sighted Guides. Furthermore, the subsequent step in the process is to evaluate these outputs by visually impaired people themselves to identify the most effective prompt pattern. © 2025 Copyright held by the owner/author(s).},
keywords = {Accessibility, Behavioral Research, Blind, Blind people, Helmet mounted displays, Human engineering, Human rehabilitation equipment, Interactive computer graphics, Interactive computer systems, Language Model, LLM, Multi-modal, Rendered environment, rendered environments, Spatial cognition, Virtual Reality, Vision aids, Visual impairment, Visual languages, Visually impaired people},
pubstate = {published},
tppubtype = {inproceedings}
}
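For illustration, a minimal Python sketch of the prompt-comparison loop the abstract above describes: several prompt patterns are sent to a multimodal model together with a rendered frame, and the resulting descriptions are ranked by an automatic metric. The prompt texts, the describe_frame stub (standing in for an MLLM call) and the score_description stub (standing in for a G-Eval/VIEScore-style rater) are hypothetical placeholders, not the authors' code.

# Hypothetical sketch: compare several prompt patterns for describing a rendered
# VR frame to a blind user, then rank them with a placeholder evaluation metric.

PROMPT_PATTERNS = {
    "spatial": "Describe the room from the player's point of view, left to right.",
    "task":    "List the objects the player would need to escape, nearest first.",
    "brief":   "Give a two-sentence overview of the scene and any exits.",
}

def describe_frame(frame_png: bytes, prompt: str) -> str:
    """Placeholder for a multimodal LLM call (image + text prompt -> description)."""
    return f"[description generated for prompt: {prompt!r}]"

def score_description(description: str) -> float:
    """Placeholder for a G-Eval/VIEScore-style automatic rating in [0, 1]."""
    return min(1.0, len(description) / 200)

def rank_prompts(frame_png: bytes) -> list[tuple[str, float]]:
    results = []
    for name, prompt in PROMPT_PATTERNS.items():
        desc = describe_frame(frame_png, prompt)
        results.append((name, score_description(desc)))
    return sorted(results, key=lambda pair: pair[1], reverse=True)

if __name__ == "__main__":
    print(rank_prompts(b"\x89PNG..."))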
Behravan, M.; Matković, K.; Gračanin, D.
Generative AI for Context-Aware 3D Object Creation Using Vision-Language Models in Augmented Reality Proceedings Article
In: Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR, pp. 73–81, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833152157-8 (ISBN).
@inproceedings{behravan_generative_2025,
title = {Generative AI for Context-Aware 3D Object Creation Using Vision-Language Models in Augmented Reality},
author = {M. Behravan and K. Matković and D. Gračanin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105000292700&doi=10.1109%2fAIxVR63409.2025.00018&partnerID=40&md5=b40fa769a6b427918c3fcd86f7c52a75},
doi = {10.1109/AIxVR63409.2025.00018},
isbn = {979-833152157-8 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR},
pages = {73–81},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {We present a novel Artificial Intelligence (AI) system that functions as a designer assistant in augmented reality (AR) environments. Leveraging Vision Language Models (VLMs) like LLaVA and advanced text-to-3D generative models, users can capture images of their surroundings with an Augmented Reality (AR) headset. The system analyzes these images to recommend contextually relevant objects that enhance both functionality and visual appeal. The recommended objects are generated as 3D models and seamlessly integrated into the AR environment for interactive use. Our system utilizes open-source AI models running on local systems to enhance data security and reduce operational costs. Key features include context-aware object suggestions, optimal placement guidance, aesthetic matching, and an intuitive user interface for real-time interaction. Evaluations using the COCO 2017 dataset and real-world AR testing demonstrated high accuracy in object detection and a contextual fit rating of 4.1 out of 5. By addressing the challenge of providing context-aware object recommendations in AR, our system expands the capabilities of AI applications in this domain. It enables users to create personalized digital spaces efficiently, leveraging AI for contextually relevant suggestions. © 2025 IEEE.},
keywords = {3D object, 3D Object Generation, Artificial intelligence systems, Augmented Reality, Capture images, Context-Aware, Generative adversarial networks, Generative AI, generative artificial intelligence, Generative model, Language Model, Object creation, Vision language model, vision language models, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
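A minimal sketch, under stated assumptions, of the capture-recommend-generate-place pipeline summarized above; recommend_objects stands in for a LLaVA-style vision-language model and generate_3d for a text-to-3D generator, both as hypothetical stubs rather than the paper's implementation.

# Hypothetical sketch of the capture -> recommend -> generate -> place pipeline.
from dataclasses import dataclass

@dataclass
class Placement:
    prompt: str
    mesh_path: str
    position: tuple[float, float, float]

def recommend_objects(room_image: bytes) -> list[str]:
    """Placeholder for a vision-language model that suggests fitting objects."""
    return ["floor lamp", "potted plant"]

def generate_3d(prompt: str) -> str:
    """Placeholder for a text-to-3D generator; returns a path to a mesh file."""
    return f"/tmp/{prompt.replace(' ', '_')}.glb"

def suggest_placements(room_image: bytes) -> list[Placement]:
    placements = []
    for i, prompt in enumerate(recommend_objects(room_image)):
        placements.append(Placement(prompt, generate_3d(prompt), (1.0 * i, 0.0, 0.5)))
    return placements

if __name__ == "__main__":
    for p in suggest_placements(b"..."):
        print(p)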
Oliveira, E. A. Masasi De; Sousa, R. T.; Bastos, A. A.; Cintra, L. Martins De Freitas; Filho, A. R. G.
Immersive Virtual Museums with Spatially-Aware Retrieval-Augmented Generation Proceedings Article
In: IMX - Proc. ACM Int. Conf. Interact. Media Experiences, pp. 437–440, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071391-0 (ISBN).
@inproceedings{masasi_de_oliveira_immersive_2025,
title = {Immersive Virtual Museums with Spatially-Aware Retrieval-Augmented Generation},
author = {E. A. Masasi De Oliveira and R. T. Sousa and A. A. Bastos and L. Martins De Freitas Cintra and A. R. G. Filho},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007979183&doi=10.1145%2f3706370.3731643&partnerID=40&md5=db10b41217dd8a0b0705c3fb4a615666},
doi = {10.1145/3706370.3731643},
isbn = {979-840071391-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {IMX - Proc. ACM Int. Conf. Interact. Media Experiences},
pages = {437–440},
publisher = {Association for Computing Machinery, Inc},
abstract = {Virtual Reality has significantly expanded possibilities for immersive museum experiences, overcoming traditional constraints such as space, preservation, and geographic limitations. However, existing virtual museum platforms typically lack dynamic, personalized, and contextually accurate interactions. To address this, we propose Spatially-Aware Retrieval-Augmented Generation (SA-RAG), an innovative framework integrating visual attention tracking with Retrieval-Augmented Generation systems and advanced Large Language Models. By capturing users' visual attention in real time, SA-RAG dynamically retrieves contextually relevant data, enhancing the accuracy, personalization, and depth of user interactions within immersive virtual environments. The system's effectiveness is initially demonstrated through our preliminary tests within a realistic VR museum implemented using Unreal Engine. Although promising, comprehensive human evaluations involving broader user groups are planned for future studies to rigorously validate SA-RAG's effectiveness, educational enrichment potential, and accessibility improvements in virtual museums. The framework also presents opportunities for broader applications in immersive educational and storytelling domains. © 2025 Copyright held by the owner/author(s).},
keywords = {Association reactions, Behavioral Research, Generation systems, Geographics, Human computer interaction, Human engineering, Immersive, Information Retrieval, Interactive computer graphics, Language Model, Large language model, large language models, Museums, Retrieval-Augmented Generation, Search engines, Spatially aware, User interfaces, Virtual environments, Virtual museum, Virtual museum., Virtual Reality, Visual Attention, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
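A hedged sketch of the gaze-driven retrieval idea behind SA-RAG as described above: the exhibit the visitor is looking at selects the context that grounds the language model's answer. The exhibit notes, the gaze resolver, and the llm_answer stub are illustrative assumptions, not the paper's implementation.

# Hypothetical sketch of a gaze-driven retrieval-augmented answer.
EXHIBIT_NOTES = {
    "amphora": "Greek amphora, 5th century BC, used to store wine and oil.",
    "mask":    "Funerary mask, gilded wood, Late Period Egypt.",
}

def resolve_gazed_exhibit(gaze_ray: tuple[float, float, float]) -> str:
    """Placeholder: map an eye-tracking ray to the exhibit it hits."""
    return "amphora"

def llm_answer(question: str, context: str) -> str:
    """Placeholder for an LLM call grounded in the retrieved context."""
    return f"(answer to {question!r} using context: {context[:40]}...)"

def answer_visitor(question: str, gaze_ray: tuple[float, float, float]) -> str:
    exhibit = resolve_gazed_exhibit(gaze_ray)
    context = EXHIBIT_NOTES.get(exhibit, "")
    return llm_answer(question, context)

if __name__ == "__main__":
    print(answer_visitor("What was this used for?", (0.1, -0.2, 0.97)))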
Zhang, G.; Wang, Y.; Luo, C.; Xu, S.; Ming, Y.; Peng, J.; Zhang, M.
Visual Harmony: LLM’s Power in Crafting Coherent Indoor Scenes from Images Proceedings Article
In: Z., Lin; H., Zha; M.-M., Cheng; R., He; C.-L., Liu; K., Ubul; W., Silamu; J., Zhou (Ed.): Lect. Notes Comput. Sci., pp. 3–17, Springer Science and Business Media Deutschland GmbH, 2025, ISSN: 0302-9743; ISBN: 978-981978507-0.
@inproceedings{zhang_visual_2025,
title = {Visual Harmony: LLM’s Power in Crafting Coherent Indoor Scenes from Images},
author = {G. Zhang and Y. Wang and C. Luo and S. Xu and Y. Ming and J. Peng and M. Zhang},
editor = {Lin Z. and Zha H. and Cheng M.-M. and He R. and Liu C.-L. and Ubul K. and Silamu W. and Zhou J.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85209374797&doi=10.1007%2f978-981-97-8508-7_1&partnerID=40&md5=5231ab0bce95fb3f09db80392acd58ff},
doi = {10.1007/978-981-97-8508-7_1},
isbn = {03029743 (ISSN); 978-981978507-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {15036 LNCS},
pages = {3–17},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {Indoor scene generation has recently attracted significant attention as it is crucial for metaverse, 3D animation, visual effects in movies, and virtual/augmented reality. Existing image-based indoor scene generation methods often produce scenes that are not realistic enough, with issues such as floating objects, incorrect object orientations, and incomplete scenes that only include the part of the scene captured by the input image. To address these challenges, we propose Visual Harmony, a method that leverages the powerful spatial imagination capabilities of Large Language Model (LLM) to generate corresponding indoor scenes based on the input image. Specifically, we first extract information from the input image through depth estimation and panorama segmentation, reconstructing a semantic point cloud. Using this reconstructed semantic point cloud, we extract a scene graph that describes only the objects in the image. Then we leverage the strong spatial imagination capabilities of LLM to complete the scene graph, forming a representation of a complete room scene. Based on this fine scene graph, we can generate an entire indoor scene that includes both the captured and uncaptured parts of the input image. Extensive experiments demonstrate that our method can generate realistic, plausible, and highly relevant complete indoor scenes related to the input image. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2025.},
keywords = {Augmented Reality, Depth perception, Indoor scene generation, Input image, Language Model, Large language model, Metaverses, Point-clouds, Power, Scene completion, Scene Generation, Scene-graphs, Semantic Segmentation, Semantics, Virtual Reality, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
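A minimal sketch of the scene-graph completion step described above, assuming a hypothetical llm_complete_scene_graph call: a partial graph extracted from the input image is returned with plausible unseen objects and relations added. The graph layout and canned additions are illustrative only.

# Hypothetical sketch of LLM-based scene-graph completion.
import json

partial_graph = {
    "objects": ["bed", "nightstand"],
    "relations": [["nightstand", "left_of", "bed"]],
}

def llm_complete_scene_graph(graph: dict) -> dict:
    """Placeholder for prompting an LLM to return a completed scene graph."""
    completed = json.loads(json.dumps(graph))  # deep copy of the partial graph
    completed["objects"] += ["wardrobe", "ceiling lamp"]
    completed["relations"] += [["wardrobe", "opposite", "bed"]]
    return completed

if __name__ == "__main__":
    print(json.dumps(llm_complete_scene_graph(partial_graph), indent=2))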
Kim, Y.; Aamir, Z.; Singh, M.; Boorboor, S.; Mueller, K.; Kaufman, A. E.
Explainable XR: Understanding User Behaviors of XR Environments Using LLM-Assisted Analytics Framework Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2756–2766, 2025, ISSN: 1077-2626.
@article{kim_explainable_2025,
title = {Explainable XR: Understanding User Behaviors of XR Environments Using LLM-Assisted Analytics Framework},
author = {Y. Kim and Z. Aamir and M. Singh and S. Boorboor and K. Mueller and A. E. Kaufman},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003815583&doi=10.1109%2fTVCG.2025.3549537&partnerID=40&md5=1085b698db06656985f80418cb37b773},
doi = {10.1109/TVCG.2025.3549537},
issn = {1077-2626},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2756–2766},
abstract = {We present Explainable XR, an end-to-end framework for analyzing user behavior in diverse eXtended Reality (XR) environments by leveraging Large Language Models (LLMs) for data interpretation assistance. Existing XR user analytics frameworks face challenges in handling cross-virtuality - AR, VR, MR - transitions, multi-user collaborative application scenarios, and the complexity of multimodal data. Explainable XR addresses these challenges by providing a virtuality-agnostic solution for the collection, analysis, and visualization of immersive sessions. We propose three main components in our framework: (1) a novel user data recording schema, called User Action Descriptor (UAD), that can capture the users' multimodal actions, along with their intents and the contexts; (2) a platform-agnostic XR session recorder, and (3) a visual analytics interface that offers LLM-assisted insights tailored to the analysts' perspectives, facilitating the exploration and analysis of the recorded XR session data. We demonstrate the versatility of Explainable XR through five use-case scenarios, in both individual and collaborative XR applications across virtualities. Our technical evaluation and user studies show that Explainable XR provides a highly usable analytics solution for understanding user actions and delivering multifaceted, actionable insights into user behaviors in immersive environments. © 1995-2012 IEEE.},
keywords = {adult, Agnostic, Article, Assistive, Cross Reality, Data Analytics, Data collection, data interpretation, Data recording, Data visualization, Extended reality, human, Language Model, Large language model, large language models, Multi-modal, Multimodal Data Collection, normal human, Personalized assistive technique, Personalized Assistive Techniques, recorder, Spatio-temporal data, therapy, user behavior, User behaviors, Virtual addresses, Virtual environments, Virtual Reality, Visual analytics, Visual languages},
pubstate = {published},
tppubtype = {article}
}
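A small sketch of what a User Action Descriptor-style record could look like; the field names below are assumptions for illustration and do not reproduce the paper's actual UAD schema.

# Hypothetical sketch of a per-action record in the spirit of the UAD idea above.
from dataclasses import dataclass, asdict
import json, time

@dataclass
class UserActionRecord:
    timestamp: float
    user_id: str
    virtuality: str                      # "AR", "VR" or "MR"
    action: str                          # e.g. "grab", "teleport", "utterance"
    target: str | None                   # object or location acted upon
    intent: str | None                   # optional annotation of why
    gaze: tuple[float, float, float] | None = None

record = UserActionRecord(time.time(), "u01", "VR", "grab", "wrench", "repair task")
print(json.dumps(asdict(record)))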
2024
Venkatachalam, N.; Rayana, M.; Vignesh, S. Bala; Prathamesh, S.
Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences Proceedings Article
In: Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT, pp. 1133–1138, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835032753-3 (ISBN).
@inproceedings{venkatachalam_voice-driven_2024,
title = {Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences},
author = {N. Venkatachalam and M. Rayana and S. Bala Vignesh and S. Prathamesh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190121845&doi=10.1109%2fIDCIoT59759.2024.10467441&partnerID=40&md5=6594fbab013d9156b79a887f0d7209cb},
doi = {10.1109/IDCIoT59759.2024.10467441},
isbn = {979-835032753-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT},
pages = {1133–1138},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This research study introduces an innovative system that aims to synthesize 360-degree panoramic images in real time based on vocal prompts from the user, leveraging state-of-the-art Generative AI with a combination of advanced NLP models. The primary objective of this system is to transform spoken descriptions into immersive and interactive visual scenes, specifically designed to provide users with first-person field views. This cutting-edge technology has the potential to revolutionize the realm of virtual reality (VR) experiences, enabling users to effortlessly create and navigate through personalized environments. The fundamental goal of this system is to enable the generation of real-time images that are seamlessly compatible with VR headsets, offering a truly immersive and adaptive visual experience. Beyond its technological advancements, this research also highlights its significant potential for creating a positive social impact. One notable application lies in psychological interventions, particularly in the context of phobia treatment and therapeutic settings. Here, patients can safely confront and work through their fears within these synthesized environments, potentially offering new avenues for therapy. Furthermore, the system serves educational and entertainment purposes by bringing users' imaginations to life, providing an unparalleled platform for exploring the boundaries of virtual experiences. Overall, this research represents a promising stride towards a more immersive and adaptable future in VR technology, with the potential to enhance various aspects of human lives, from mental health treatment to entertainment and education. © 2024 IEEE.},
keywords = {Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets},
pubstate = {published},
tppubtype = {inproceedings}
}
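A minimal sketch of the voice-to-panorama loop the abstract describes, with speech-to-text, text-to-360-image synthesis, and headset display all reduced to hypothetical stubs; none of these functions belong to the paper's system.

# Hypothetical sketch: spoken prompt -> transcript -> panorama -> headset skybox.
def transcribe(audio_wav: bytes) -> str:
    """Placeholder speech-to-text call."""
    return "a quiet beach at sunset with palm trees"

def synthesize_panorama(prompt: str) -> bytes:
    """Placeholder text-to-360-image call; returns equirectangular image bytes."""
    return b"EQUIRECT_IMAGE"

def display_on_headset(panorama: bytes) -> None:
    """Placeholder: push the panorama to the VR headset as a skybox."""
    print(f"displaying {len(panorama)} bytes as skybox")

def on_voice_command(audio_wav: bytes) -> None:
    display_on_headset(synthesize_panorama(transcribe(audio_wav)))

on_voice_command(b"RIFF...")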
Liang, Q.; Chen, Y.; Li, W.; Lai, M.; Ni, W.; Qiu, H.
iKnowiSee: AR Glasses with Language Learning Translation System and Identity Recognition System Built Based on Large Pre-trained Models of Language and Vision and Internet of Things Technology Proceedings Article
In: L., Zhang; W., Yu; Q., Wang; Y., Laili; Y., Liu (Ed.): Commun. Comput. Info. Sci., pp. 12–24, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 1865-0929; ISBN: 978-981973947-9.
@inproceedings{liang_iknowisee_2024,
title = {iKnowiSee: AR Glasses with Language Learning Translation System and Identity Recognition System Built Based on Large Pre-trained Models of Language and Vision and Internet of Things Technology},
author = {Q. Liang and Y. Chen and W. Li and M. Lai and W. Ni and H. Qiu},
editor = {Zhang L. and Yu W. and Wang Q. and Laili Y. and Liu Y.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200663840&doi=10.1007%2f978-981-97-3948-6_2&partnerID=40&md5=a0324ba6108674b1d39a338574269d60},
doi = {10.1007/978-981-97-3948-6_2},
isbn = {18650929 (ISSN); 978-981973947-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Commun. Comput. Info. Sci.},
volume = {2139 CCIS},
pages = {12–24},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {AR glasses used in daily life have made good progress and have some practical value. However, the current design concept of AR glasses is basically to simply port the content of a cell phone and act as a secondary screen for the phone. In contrast, the AR glasses we designed are based on actual situations, focus on real-world interactions, and utilize IoT technology with the aim of enabling users to fully extract and utilize the digital information in their lives. We have created two innovative features: one is a language learning translation system for users to learn foreign languages, which integrates a large language model with an open vocabulary recognition model to fully extract the visual semantic information of the scene; and the other is a social conferencing system, which utilizes the IoT cloud, pipe, edge, and end development to reduce the cost of communication and improve the efficiency of exchanges in social situations. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2024.},
keywords = {Augmented Reality, Glass, Identity recognition, Internet of Things, Internet of things technologies, IoT, Language learning, Learning systems, LLM, Object Detection, Objects detection, Open Vocabulary Object Detection, Recognition systems, Semantics, Telephone sets, Translation (languages), Translation systems, Visual languages, Wearable computers, Wearable device, Wearable devices},
pubstate = {published},
tppubtype = {inproceedings}
}
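A hedged sketch of the language-learning flow described above: an open-vocabulary detector names the objects in the glasses' camera frame and an LLM turns each into a vocabulary card. Both model calls are placeholder stubs, not the iKnowiSee implementation.

# Hypothetical sketch: camera frame -> detected objects -> LLM vocabulary cards.
def detect_objects(frame: bytes) -> list[str]:
    """Placeholder open-vocabulary object detector."""
    return ["coffee cup", "laptop"]

def llm_vocabulary_card(obj: str, target_language: str) -> str:
    """Placeholder LLM call producing a short vocabulary card."""
    return f"{obj} -> <{target_language} word> : <example sentence>"

def annotate_frame(frame: bytes, target_language: str = "French") -> list[str]:
    return [llm_vocabulary_card(o, target_language) for o in detect_objects(frame)]

print("\n".join(annotate_frame(b"JPEG...")))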
Shabanijou, M.; Sharma, V.; Ray, S.; Lu, R.; Xiong, P.
Large Language Model Empowered Spatio-Visual Queries for Extended Reality Environments Proceedings Article
In: W., Ding; C.-T., Lu; F., Wang; L., Di; K., Wu; J., Huan; R., Nambiar; J., Li; F., Ilievski; R., Baeza-Yates; X., Hu (Ed.): Proc. - IEEE Int. Conf. Big Data, BigData, pp. 5843–5846, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835036248-0 (ISBN).
@inproceedings{shabanijou_large_2024,
title = {Large Language Model Empowered Spatio-Visual Queries for Extended Reality Environments},
author = {M. Shabanijou and V. Sharma and S. Ray and R. Lu and P. Xiong},
editor = {Ding W. and Lu C.-T. and Wang F. and Di L. and Wu K. and Huan J. and Nambiar R. and Li J. and Ilievski F. and Baeza-Yates R. and Hu X.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85218011140&doi=10.1109%2fBigData62323.2024.10825084&partnerID=40&md5=fdd78814b8e19830d1b8ecd4b33b0102},
doi = {10.1109/BigData62323.2024.10825084},
isbn = {979-835036248-0 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Conf. Big Data, BigData},
pages = {5843–5846},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {With the technological advances in the creation and capture of 3D spatial data, new emerging applications are being developed. Digital Twins, metaverse and extended reality (XR) based immersive environments can be enriched by leveraging geocoded 3D spatial data. Unlike 2D spatial queries, queries involving 3D immersive environments need to take the query user's viewpoint into account. Spatio-visual queries return objects that are visible from the user's perspective. In this paper, we propose enhancing 3D spatio-visual queries with large language models (LLM). These kinds of queries allow a user to interact with the visible objects using a natural language interface. We have implemented a proof-of-concept prototype and conducted a preliminary evaluation. Our results demonstrate the potential of truly interactive immersive environments. © 2024 IEEE.},
keywords = {3D modeling, Digital elevation model, Emerging applications, Immersive environment, Language Model, Metaverses, Modeling languages, Natural language interfaces, Query languages, spatial data, Spatial queries, Structured Query Language, Technological advances, Users perspective, Virtual environments, Visual languages, Visual query},
pubstate = {published},
tppubtype = {inproceedings}
}
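A minimal sketch of a viewpoint-aware spatio-visual query in the spirit of the abstract above: only objects inside the user's field of view are passed to a natural-language handler. The 2D view-cone geometry and the llm_answer stub are simplifying assumptions, not the prototype's code.

# Hypothetical sketch: filter scene objects by the user's view cone, then answer.
import math

SCENE = {"fountain": (2.0, 5.0), "statue": (-4.0, 1.0), "kiosk": (1.0, 9.0)}

def visible_objects(position, heading_deg, fov_deg=90.0, max_range=10.0):
    """Return object names inside a simple 2D viewing cone."""
    visible = []
    for name, (x, y) in SCENE.items():
        dx, dy = x - position[0], y - position[1]
        dist = math.hypot(dx, dy)
        bearing = math.degrees(math.atan2(dy, dx))
        if dist <= max_range and abs((bearing - heading_deg + 180) % 360 - 180) <= fov_deg / 2:
            visible.append(name)
    return visible

def llm_answer(question: str, objects: list[str]) -> str:
    """Placeholder LLM call over the visible-object list."""
    return f"Visible from here: {', '.join(objects) or 'nothing'}."

print(llm_answer("What can I see?", visible_objects((0.0, 0.0), 70.0)))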
Sonawani, S.; Weigend, F.; Amor, H. B.
SiSCo: Signal Synthesis for Effective Human-Robot Communication Via Large Language Models Proceedings Article
In: IEEE Int Conf Intell Rob Syst, pp. 7107–7114, Institute of Electrical and Electronics Engineers Inc., 2024, ISSN: 2153-0858; ISBN: 979-835037770-5.
@inproceedings{sonawani_sisco_2024,
title = {SiSCo: Signal Synthesis for Effective Human-Robot Communication Via Large Language Models},
author = {S. Sonawani and F. Weigend and H. B. Amor},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85216466596&doi=10.1109%2fIROS58592.2024.10802561&partnerID=40&md5=ccd14b4f0b5d527b179394dffd4e2c73},
doi = {10.1109/IROS58592.2024.10802561},
isbn = {21530858 (ISSN); 979-835037770-5 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Int Conf Intell Rob Syst},
pages = {7107–7114},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Effective human-robot collaboration hinges on robust communication channels, with visual signaling playing a pivotal role due to its intuitive appeal. Yet, the creation of visually intuitive cues often demands extensive resources and specialized knowledge. The emergence of Large Language Models (LLMs) offers promising avenues for enhancing human-robot interactions and revolutionizing the way we generate context-aware visual cues. To this end, we introduce SiSCo, a novel framework that combines the computational power of LLMs with mixed-reality technologies to streamline the creation of visual cues for human-robot collaboration. Our results show that SiSCo improves the efficiency of communication in human-robot teaming tasks, reducing task completion time by approximately 73% and increasing task success rates by 18% compared to baseline natural language signals. Additionally, SiSCo reduces cognitive load for participants by 46%, as measured by the NASA-TLX subscale, and receives above-average user ratings for on-the-fly signals generated for unseen objects. To encourage further development and broader community engagement, we provide full access to SiSCo's implementation and related materials on our GitHub repository. © 2024 IEEE.},
keywords = {Communications channels, Extensive resources, Human engineering, Human Robot Interaction, Human-Robot Collaboration, Human-robot communication, Humans-robot interactions, Industrial robots, Intelligent robots, Language Model, Man machine systems, Microrobots, Robust communication, Signal synthesis, Specialized knowledge, Visual communication, Visual cues, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
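A hedged sketch of the signal-synthesis idea above: an LLM-style call maps a robot's intent to a structured visual cue that a mixed-reality renderer could display. Both functions are hypothetical placeholders, not SiSCo's released code.

# Hypothetical sketch: robot intent -> structured visual-signal spec -> rendered cue.
import json

def llm_signal_spec(intent: str, obj: str) -> dict:
    """Placeholder LLM call returning a structured cue (icon, color, anchor)."""
    return {"icon": "arrow_down", "color": "yellow", "label": f"{intent} {obj}", "anchor": obj}

def render_cue(spec: dict) -> None:
    """Placeholder for projecting/overlaying the cue in mixed reality."""
    print("rendering cue:", json.dumps(spec))

render_cue(llm_signal_spec("hand over", "torque wrench"))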
Behravan, M.; Gracanin, D.
Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality Proceedings Article
In: S.N., Spencer (Ed.): Proc. ACM Symp. Virtual Reality Softw. Technol. VRST, Association for Computing Machinery, 2024, ISBN: 979-840070535-9 (ISBN).
@inproceedings{behravan_generative_2024,
title = {Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality},
author = {M. Behravan and D. Gracanin},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85212524068&doi=10.1145%2f3641825.3689685&partnerID=40&md5=daf8aa8960d9dd4dbdbf67ccb1e7fb83},
doi = {10.1145/3641825.3689685},
isbn = {979-840070535-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. ACM Symp. Virtual Reality Softw. Technol. VRST},
publisher = {Association for Computing Machinery},
abstract = {We introduce a framework that uses generative Artificial Intelligence (AI) for dynamic and context-aware content creation in Augmented Reality (AR). By integrating Vision Language Models (VLMs), our system detects and understands the physical space around the user, recommending contextually relevant objects. These objects are transformed into 3D models using text-to-3D generative AI techniques, allowing for real-time content inclusion within the AR space. This approach enhances user experience by enabling intuitive customization through spoken commands, while reducing costs and improving accessibility to advanced AR interactions. The framework's vision and language capabilities support the generation of comprehensive and context-specific 3D objects. © 2024 Owner/Author.},
keywords = {3D object, 3D Object Generation, Augmented Reality, Content creation, Context-Aware, Generative adversarial networks, Generative AI, generative artificial intelligence, Language Model, Multi-modal, Real- time, Time contexts, Vision language model, vision language models, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Xu, S.; Wei, Y.; Zheng, P.; Zhang, J.; Yu, C.
LLM enabled generative collaborative design in a mixed reality environment Journal Article
In: Journal of Manufacturing Systems, vol. 74, pp. 703–715, 2024, ISSN: 0278-6125.
@article{xu_llm_2024,
title = {LLM enabled generative collaborative design in a mixed reality environment},
author = {S. Xu and Y. Wei and P. Zheng and J. Zhang and C. Yu},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85192244873&doi=10.1016%2fj.jmsy.2024.04.030&partnerID=40&md5=3f050c429cf5a4120d10a432311f46cb},
doi = {10.1016/j.jmsy.2024.04.030},
issn = {0278-6125},
year = {2024},
date = {2024-01-01},
journal = {Journal of Manufacturing Systems},
volume = {74},
pages = {703–715},
abstract = {In the collaborative design process, diverse stakeholder backgrounds often introduce inefficiencies in collaboration, such as delays in design delivery and decreased creativity, primarily due to misunderstandings and communication barriers caused by this diversity. To respond, this study proposes an AI-augmented Multimodal Collaborative Design (AI-MCD) framework. This framework utilizes Large Language Models (LLM) to establish an iterative prompting mechanism that provides professional design prompts for Generative AI (GAI) to generate precise visual schemes. On this basis, the GAI cooperates with Mixed Reality (MR) technology to form an interactive and immersive environment for enabling full participation in the design process. By integrating these technologies, the study aims to help stakeholders form a unified cognition and optimize the traditional collaborative design process. Through a case study involving the development of heart education products for children, the effectiveness of the framework is emphasized, and the practical application and effectiveness of the proposed method innovation are demonstrated. © 2024 The Society of Manufacturing Engineers},
keywords = {Collaborative design, Collaborative design process, Communication barriers, Computational Linguistics, design, Design frameworks, generative artificial intelligence, Iterative methods, Language Model, Large language model, Mixed reality, Mixed-reality environment, Multi-modal, Visual languages},
pubstate = {published},
tppubtype = {article}
}
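A minimal sketch of the iterative prompting mechanism described above, under stated assumptions: an LLM refines the design brief with stakeholder feedback, a generative model renders a visual scheme, and the loop repeats until a review round raises no further comments. All calls and the feedback source are placeholder stubs, not the AI-MCD framework itself.

# Hypothetical sketch of an iterative brief -> prompt -> scheme -> feedback loop.
def llm_refine_prompt(brief: str, feedback: str | None) -> str:
    """Placeholder LLM call that turns brief + feedback into a generation prompt."""
    return brief if feedback is None else f"{brief}, revised: {feedback}"

def generate_visual_scheme(prompt: str) -> str:
    """Placeholder generative-AI call; returns an id of the rendered scheme."""
    return f"scheme[{prompt[:30]}...]"

def collect_feedback(scheme: str, round_no: int) -> str | None:
    """Placeholder for stakeholder review inside the MR session."""
    return None if round_no >= 2 else "make it friendlier for children"

def co_design(brief: str, max_rounds: int = 3) -> str:
    feedback = None
    scheme = ""
    for round_no in range(max_rounds):
        prompt = llm_refine_prompt(brief, feedback)
        scheme = generate_visual_scheme(prompt)
        feedback = collect_feedback(scheme, round_no)
        if feedback is None:
            break
    return scheme

print(co_design("heart-education toy for children"))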
2023
Le, M. -H.; Chu, C. -B.; Le, K. -D.; Nguyen, T. V.; Tran, M. -T.; Le, T. -N.
VIDES: Virtual Interior Design via Natural Language and Visual Guidance Proceedings Article
In: G., Bruder; A.H., Olivier; A., Cunningham; E.Y., Peng; J., Grubert; I., Williams (Ed.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 689–694, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835032891-2 (ISBN).
@inproceedings{le_vides_2023,
title = {VIDES: Virtual Interior Design via Natural Language and Visual Guidance},
author = {M. -H. Le and C. -B. Chu and K. -D. Le and T. V. Nguyen and M. -T. Tran and T. -N. Le},
editor = {Bruder G. and Olivier A.H. and Cunningham A. and Peng E.Y. and Grubert J. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85180376943&doi=10.1109%2fISMAR-Adjunct60411.2023.00148&partnerID=40&md5=5ce45d9e97fc5a9fdc31eb7514b3def3},
doi = {10.1109/ISMAR-Adjunct60411.2023.00148},
isbn = {979-835032891-2 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {689–694},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Interior design is crucial in creating aesthetically pleasing and functional indoor spaces. However, developing and editing interior design concepts requires significant time and expertise. We propose Virtual Interior DESign (VIDES) system in response to this challenge. Leveraging cutting-edge technology in generative AI, our system can assist users in generating and editing indoor scene concepts quickly, given user text description and visual guidance. Using both visual guidance and language as the conditional inputs significantly enhances the accuracy and coherence of the generated scenes, resulting in visually appealing designs. Through extensive experimentation, we demonstrate the effectiveness of VIDES in developing new indoor concepts, changing indoor styles, and replacing and removing interior objects. The system successfully captures the essence of users' descriptions while providing flexibility for customization. Consequently, this system can potentially reduce the entry barrier for indoor design, making it more accessible to users with limited technical skills and reducing the time required to create high-quality images. Individuals who have a background in design can now easily communicate their ideas visually and effectively present their design concepts. © 2023 IEEE.},
keywords = {Architectural design, Customisation, Cutting edge technology, Design concept, Design systems, Image editing, Image generation, Image generations, Indoor space, Interior Design, Interior designs, Interiors (building), Natural languages, Virtual Reality, Visual guidance, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
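A hedged sketch of the conditional editing interface the abstract above describes: an edit request bundles the current room image, a text instruction, and optional visual guidance, and a generative editor returns the modified image. The EditRequest type and the generative_edit stub are illustrative assumptions, not the VIDES system.

# Hypothetical sketch: text instruction + optional guidance mask -> edited room image.
from dataclasses import dataclass

@dataclass
class EditRequest:
    room_image: bytes
    instruction: str                 # e.g. "replace the sofa with a blue one"
    guidance_mask: bytes | None      # optional region or sketch steering the edit

def generative_edit(request: EditRequest) -> bytes:
    """Placeholder for an image-generation model conditioned on text + guidance."""
    return b"EDITED_IMAGE"

result = generative_edit(EditRequest(b"JPEG...", "remove the rug", None))
print(len(result), "bytes")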