AHCI RESEARCH GROUP
Publications
Papers published in international journals, conference and workshop proceedings, and books.
2025
Song, T.; Liu, Z.; Zhao, R.; Fu, J.
ElderEase AR: Enhancing Elderly Daily Living with the Multimodal Large Language Model and Augmented Reality Proceedings Article
In: ICVRT - Proc. Int. Conf. Virtual Real. Technol., pp. 60–67, Association for Computing Machinery, Inc, 2025, ISBN: 979-8-4007-1018-6.
@inproceedings{song_elderease_2025,
title = {ElderEase AR: Enhancing Elderly Daily Living with the Multimodal Large Language Model and Augmented Reality},
author = {T. Song and Z. Liu and R. Zhao and J. Fu},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001924899&doi=10.1145%2f3711496.3711505&partnerID=40&md5=4df693735547b505172657a73359f3ca},
doi = {10.1145/3711496.3711505},
isbn = {979-8-4007-1018-6},
year = {2025},
date = {2025-01-01},
booktitle = {ICVRT - Proc. Int. Conf. Virtual Real. Technol.},
pages = {60–67},
publisher = {Association for Computing Machinery, Inc},
abstract = {Elderly individuals often face challenges in independent living due to age-related cognitive and physical decline. To address these issues, we propose an innovative Augmented Reality (AR) system, “ElderEase AR”, designed to assist elderly users in their daily lives by leveraging a Multimodal Large Language Model (MLLM). This system enables elderly users to capture images of their surroundings and ask related questions, providing context-aware feedback. We evaluated the system’s perceived ease-of-use and feasibility through a pilot study involving 30 elderly users, aiming to enhance their independence and quality of life. Our system integrates advanced AR technology with an intelligent agent trained on multimodal datasets. Through prompt engineering, the agent is tailored to respond in a manner that aligns with the speaking style of elderly users. Experimental results demonstrate high accuracy in object recognition and question answering, with positive feedback from user trials. Specifically, the system accurately identified objects in various environments and provided relevant answers to user queries. This study highlights the powerful potential of AR and AI technologies in creating support tools for the elderly. It suggests directions for future improvements and applications, such as enhancing the system’s adaptability to different user needs and expanding its functionality to cover more aspects of daily living. © 2024 Copyright held by the owner/author(s).},
keywords = {Age-related, Assisted living, Augmented Reality, Augmented reality technology, Daily Life Support, Daily living, Daily-life supports, Elderly, Elderly users, Independent living, Independent living systems, Language Model, Modeling languages, Multi agent systems, Multi-modal, Multimodal large language model},
pubstate = {published},
tppubtype = {inproceedings}
}
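The ElderEase AR pipeline outlined in the abstract above (capture an image of the surroundings, ask a question, receive context-aware feedback phrased for elderly users) can be illustrated with a minimal sketch. The sketch below assumes an OpenAI-style multimodal chat endpoint and a hypothetical system prompt; the paper does not disclose its model, prompts, or AR toolchain here, so none of the identifiers below are the authors' own.

# Minimal sketch of an ElderEase-style query loop: send a captured AR frame plus a
# spoken or typed question to a multimodal LLM, with a system prompt that asks for
# answers phrased in a plain, patient style suitable for elderly users.
# Assumptions: an OpenAI-compatible multimodal chat API; the model name and prompt
# wording are illustrative, not taken from the paper.
import base64
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

ELDER_STYLE_PROMPT = (
    "You are a daily-living assistant for elderly users. "
    "Answer in short, plain sentences, avoid jargon, and be patient and reassuring."
)

def ask_about_surroundings(image_path: str, question: str) -> str:
    """Send one captured frame and one question; return the assistant's answer."""
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4o",  # stand-in multimodal model; the paper's MLLM is not specified here
        messages=[
            {"role": "system", "content": ELDER_STYLE_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                    },
                ],
            },
        ],
    )
    return response.choices[0].message.content

# Example (hypothetical file name):
# answer = ask_about_surroundings("kitchen_frame.jpg", "What is this appliance and how do I turn it on?")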
2024
Villalobos, W.; Kumar, Y.; Li, J. J.
The Multilingual Eyes Multimodal Traveler’s App Proceedings Article
In: Yang, X.-S.; Sherratt, S.; Dey, N.; Joshi, A. (Ed.): Lect. Notes Networks Syst., vol. 1004, pp. 565–575, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 2367-3370, ISBN: 978-981-97-3304-0.
@inproceedings{villalobos_multilingual_2024,
title = {The Multilingual Eyes Multimodal Traveler’s App},
author = {W. Villalobos and Y. Kumar and J. J. Li},
editor = {Yang, X.-S. and Sherratt, S. and Dey, N. and Joshi, A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85201104509&doi=10.1007%2f978-981-97-3305-7_45&partnerID=40&md5=91f94aa091c97ec3ad251e07b47fa06e},
doi = {10.1007/978-981-97-3305-7_45},
issn = {2367-3370},
isbn = {978-981-97-3304-0},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Networks Syst.},
volume = {1004 LNNS},
pages = {565–575},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper presents an in-depth analysis of “The Multilingual Eyes Multimodal Traveler’s App” (MEMTA), a novel application in the realm of travel technology, leveraging advanced Artificial Intelligence (AI) capabilities. The core of MEMTA’s innovation lies in its integration of multimodal Large Language Models (LLMs), notably ChatGPT-4-Vision, to enhance navigational assistance and situational awareness for tourists and visually impaired individuals in diverse environments. The study rigorously evaluates how the incorporation of OpenAI’s Whisper and DALL-E 3 technologies augments the app’s proficiency in real-time, multilingual translation, pronunciation, and visual content generation, thereby significantly improving the user experience in various geographical settings. A key focus is placed on the development and impact of a custom GPT model, Susanin, designed specifically for the app, highlighting its advancements in Human-AI interaction and accessibility over standard LLMs. The paper thoroughly explores the practical applications of MEMTA, extending its utility beyond mere travel assistance to sectors such as robotics, virtual reality, and military operations, thus underscoring its multifaceted significance. Through this exploration, the study contributes novel insights into the fields of AI-enhanced travel, assistive technologies, and the broader scope of human-AI interaction. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2024.},
keywords = {AI in travel, Artificial intelligence in travel, Assistive navigation technologies, Assistive navigation technology, Assistive navigations, Human-AI interaction in tourism, Human-artificial intelligence interaction in tourism, Language Model, Military applications, Military operations, Multi-modal, Multilingual translations, Multimodal large language model, Multimodal LLMs, Navigation technology, Real-time, Real-time multilingual translation, Robots, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
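As a rough illustration of the MEMTA pipeline sketched in the abstract above (speech in, multimodal LLM in the middle, translated guidance out), the snippet below chains OpenAI's Whisper transcription with a vision-capable chat model. The model names, prompt text, and helper function are assumptions made for this sketch; the paper's custom GPT "Susanin" and its exact configuration are not reproduced here.

# Illustrative MEMTA-style traveler query: transcribe a spoken request with Whisper,
# then ask a vision-capable LLM to read a photographed sign or menu and answer in the
# traveler's language. Model names and prompts are placeholders, not the paper's.
import base64
from openai import OpenAI

client = OpenAI()

def traveler_assist(audio_path: str, photo_path: str, target_language: str = "English") -> str:
    # 1) Speech-to-text with Whisper
    with open(audio_path, "rb") as audio:
        transcript = client.audio.transcriptions.create(model="whisper-1", file=audio)

    # 2) Multimodal reasoning: combine the transcribed request with the photo
    with open(photo_path, "rb") as f:
        photo_b64 = base64.b64encode(f.read()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4o",  # stand-in for the "ChatGPT-4-Vision" class of models named in the abstract
        messages=[
            {
                "role": "system",
                "content": f"You assist travelers, including visually impaired users. Reply in {target_language}.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": transcript.text},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{photo_b64}"}},
                ],
            },
        ],
    )
    return response.choices[0].message.content

# Example (hypothetical files): traveler_assist("question.m4a", "street_sign.jpg", target_language="Spanish")
# Visual content generation, as mentioned in the abstract, could be added with
# client.images.generate(model="dall-e-3", prompt=...), but is omitted from this sketch.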
Masasi de Oliveira, E. A.; Silva, D. F. C.; Filho, A. R. G.
Improving VR Accessibility Through Automatic 360 Scene Description Using Multimodal Large Language Models Proceedings Article
In: ACM Int. Conf. Proc. Ser., pp. 289–293, Association for Computing Machinery, 2024, ISBN: 979-8-4007-0979-1.
@inproceedings{masasi_de_oliveira_improving_2024,
title = {Improving VR Accessibility Through Automatic 360 Scene Description Using Multimodal Large Language Models},
author = {E. A. Masasi de Oliveira and D. F. C. Silva and A. R. G. Filho},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85206580797&doi=10.1145%2f3691573.3691619&partnerID=40&md5=6e80800fce0e6b56679fbcbe982bcfa7},
doi = {10.1145/3691573.3691619},
isbn = {979-8-4007-0979-1},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
pages = {289–293},
publisher = {Association for Computing Machinery},
abstract = {Advancements in Virtual Reality (VR) technology hold immense promise for enriching immersive experiences. Despite the advancements in VR technology, there remains a significant gap in addressing accessibility concerns, particularly in automatically providing descriptive information for VR scenes. This paper combines the potential of leveraging Multimodal Large Language Models (MLLMs) to automatically generate text descriptions for 360 VR scenes according to Speech-to-Text (STT) prompts. As a case study, we conduct experiments on educational settings in VR museums, improving dynamic experiences across various contexts. Despite minor challenges in adapting MLLMs to VR Scenes, the experiments demonstrate that they can generate descriptions with high quality. Our findings provide insights for enhancing VR experiences and ensuring accessibility to individuals with disabilities or diverse needs. © 2024 Copyright held by the owner/author(s).},
keywords = {3D Scene, 3D scenes, Accessibility, Computer simulation languages, Descriptive information, Digital elevation model, Immersive, Language Model, Multi-modal, Multimodal large language model, Multimodal Large Language Models (MLLMs), Scene description, Virtual environments, Virtual Reality, Virtual Reality (VR), Virtual reality technology},
pubstate = {published},
tppubtype = {inproceedings}
}
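The pipeline described in the entry above (a Speech-to-Text prompt driving automatic description of a 360° VR scene) could look roughly like the sketch below, which splits an equirectangular panorama into a few horizontal views and asks a multimodal LLM to describe each one. The tiling strategy, model name, and prompt are illustrative assumptions; the paper's actual implementation is not published here.

# Rough sketch of MLLM-based 360 scene description: cut an equirectangular panorama
# into a few horizontal slices so each request covers a normal field of view, then
# ask a multimodal LLM to describe every slice for an accessibility-oriented summary.
# The slicing strategy, model, and prompt wording are assumptions made for this sketch.
import base64
import io
from openai import OpenAI
from PIL import Image

client = OpenAI()

def describe_360_scene(equirect_path: str, user_prompt: str, slices: int = 4) -> list[str]:
    pano = Image.open(equirect_path)
    width, height = pano.size
    descriptions = []
    for i in range(slices):
        # Crop one vertical strip of the panorama (4 strips of roughly 90 degrees each)
        left = i * width // slices
        right = (i + 1) * width // slices
        view = pano.crop((left, 0, right, height))

        buf = io.BytesIO()
        view.convert("RGB").save(buf, format="JPEG")
        view_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        response = client.chat.completions.create(
            model="gpt-4o",  # placeholder multimodal model
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text",
                     "text": f"{user_prompt} (view {i + 1} of {slices}, facing {i * 360 // slices} degrees)"},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{view_b64}"}},
                ],
            }],
        )
        descriptions.append(response.choices[0].message.content)
    return descriptions

# Example (hypothetical panorama, in the spirit of the paper's VR museum case study):
# describe_360_scene("museum_hall.jpg", "Describe this part of the museum hall for a visitor who cannot see it.")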