AHCI RESEARCH GROUP
Publications
Papers published in international journals, conference and workshop proceedings, and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTex record for each paper.
2025
Mereu, J.
Using LLMs to enhance end-user development support in XR Proceedings Article
In: Paneva, V.; Tetteroo, D.; Frau, V.; Feger, S.; Spano, D.; Paterno, F.; Sauer, S.; Manca, M. (Ed.): CEUR Workshop Proc., CEUR-WS, 2025, ISSN: 1613-0073.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules
@inproceedings{mereu_using_2025,
title = {Using LLMs to enhance end-user development support in XR},
author = {J. Mereu},
editor = {Paneva V. and Tetteroo D. and Frau V. and Feger S. and Spano D. and Paterno F. and Sauer S. and Manca M.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105008755984&partnerID=40&md5=bfaaa38c3bee309621426f8f35332107},
issn = {1613-0073},
year = {2025},
date = {2025-01-01},
booktitle = {CEUR Workshop Proc.},
volume = {3978},
publisher = {CEUR-WS},
abstract = {This paper outlines the center stage of my PhD research, which aims to empower non-developer users to create and customize eXtended Reality (XR) environments through End-User Development (EUD) techniques combined with the latest AI tools. In particular, I describe my contributions to the EUD4XR project, detailing both the work completed and the ongoing developments. EUD4XR seeks to support end-users in customizing XR content with the assistance of a Large Language Model (LLM)-based conversational agent. © 2025 Copyright for this paper by its authors.},
keywords = {Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules},
pubstate = {published},
tppubtype = {inproceedings}
}
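The EUD4XR work above targets event-condition-action (ECA) rules authored through an LLM-based conversational agent. As a rough sketch of what such a rule can look like once extracted from natural language (the schema and tiny engine below are illustrative assumptions, not the EUD4XR implementation):

# Minimal event-condition-action sketch; an LLM front-end would emit
# structured rules like `rules[0]` from a user's natural-language request.
from dataclasses import dataclass

@dataclass
class Rule:
    event: str       # e.g. "user_grabs"
    condition: str   # object the event must involve
    action: str      # effect to trigger in the XR scene

rules = [Rule(event="user_grabs", condition="key", action="unlock_door")]

def on_event(event: str, obj: str) -> list[str]:
    """Return the action of every rule matching this event/object pair."""
    return [r.action for r in rules if r.event == event and r.condition == obj]

print(on_event("user_grabs", "key"))  # ['unlock_door']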
Kai, W. -H.; Xing, K. -X.
Video-driven musical composition using large language model with memory-augmented state space Journal Article
In: Visual Computer, vol. 41, no. 5, pp. 3345–3357, 2025, ISSN: 0178-2789.
Abstract | Links | BibTeX | Tags: Associative storage, Augmented Reality, Augmented state space, Computer simulation languages, Computer system recovery, Distributed computer systems, HTTP, Language Model, Large language model, Long-term video-to-music generation, Mamba, Memory architecture, Memory-augmented, Modeling languages, Music, Musical composition, Natural language processing systems, Object oriented programming, Performance, Problem oriented languages, State space, State-space
@article{kai_video-driven_2025,
title = {Video-driven musical composition using large language model with memory-augmented state space},
author = {W. -H. Kai and K. -X. Xing},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001073242&doi=10.1007%2fs00371-024-03606-w&partnerID=40&md5=7ea24f13614a9a24caf418c37a10bd8c},
doi = {10.1007/s00371-024-03606-w},
issn = {0178-2789},
year = {2025},
date = {2025-01-01},
journal = {Visual Computer},
volume = {41},
number = {5},
pages = {3345–3357},
abstract = {The current landscape of research leveraging large language models (LLMs) is experiencing a surge. Many works harness the powerful reasoning capabilities of these models to comprehend various modalities, such as text, speech, images, videos, etc. However, the research work on LLMs for music inspiration is still in its infancy. To fill the gap in this field and break through the dilemma that LLMs can only understand short videos with limited frames, we propose a large language model with state space for long-term video-to-music generation. To capture long-range dependencies and maintain high performance while further decreasing the computing cost, our overall network includes the Enhanced Video Mamba, which incorporates continuous moving window partitioning and local feature augmentation, and a long-term memory bank that captures and aggregates historical video information to mitigate information loss in long sequences. This framework achieves both subquadratic-time computation and near-linear memory complexity, enabling effective long-term video-to-music generation. We conduct a thorough evaluation of our proposed framework. The experimental results demonstrate that our model achieves or surpasses the performance of the current state-of-the-art models. Our code is released at https://github.com/kai211233/S2L2-V2M. © The Author(s), under exclusive licence to Springer-Verlag GmbH Germany, part of Springer Nature 2024.},
keywords = {Associative storage, Augmented Reality, Augmented state space, Computer simulation languages, Computer system recovery, Distributed computer systems, HTTP, Language Model, Large language model, Long-term video-to-music generation, Mamba, Memory architecture, Memory-augmented, Modeling languages, Music, Musical composition, Natural language processing systems, Object oriented programming, Performance, Problem oriented languages, State space, State-space},
pubstate = {published},
tppubtype = {article}
}
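The model above pairs a Mamba-style state-space video encoder (with moving-window partitioning) with a long-term memory bank. A toy numpy sketch of those two ingredients, far simpler than the published network and with invented dimensions, conveys the mechanics:

import numpy as np

def ssm_scan(x, A, B, C):
    # Linear state-space recurrence: h_t = A h_{t-1} + B x_t ; y_t = C h_t.
    h, ys = np.zeros(A.shape[0]), []
    for x_t in x:                       # x: (T, d_in) frame features
        h = A @ h + B @ x_t
        ys.append(C @ h)
    return np.asarray(ys)

def encode_long_video(frames, A, B, C, window=16):
    # Moving-window partitioning plus a memory bank of window summaries
    # to mitigate information loss over long sequences.
    bank, outputs = [], []
    for start in range(0, len(frames), window):
        y = ssm_scan(frames[start:start + window], A, B, C)
        bank.append(y.mean(axis=0))                # summarize the window
        outputs.append(y + np.mean(bank, axis=0))  # crude fusion with history
    return np.concatenate(outputs)

rng = np.random.default_rng(0)
A = rng.normal(scale=0.1, size=(16, 16))
B, C = rng.normal(size=(16, 8)), rng.normal(size=(8, 16))
print(encode_long_video(rng.normal(size=(64, 8)), A, B, C).shape)  # (64, 8)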
Huang, D.; Ge, M.; Xiang, K.; Zhang, X.; Yang, H.
Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions Journal Article
In: International Journal of Network Management, vol. 35, 2025, ISSN: 1055-7148.
Abstract | Links | BibTeX | Tags: Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers
@article{huang_privacy_2025,
title = {Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions},
author = {D. Huang and M. Ge and K. Xiang and X. Zhang and H. Yang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199980257&doi=10.1002%2fnem.2292&partnerID=40&md5=2dea1caa1d31aecde3d302a908fb7dd3},
doi = {10.1002/nem.2292},
issn = {1055-7148},
year = {2025},
date = {2025-01-01},
journal = {Int J Network Manage},
volume = {35},
publisher = {John Wiley and Sons Ltd},
abstract = {Large language models (LLMs), with their billions to trillions of parameters, excel in natural language processing, machine translation, dialog systems, and text summarization. These capabilities are increasingly pivotal in the metaverse, where they can enhance virtual interactions and environments. However, their extensive use, particularly in the metaverse's immersive platforms, raises significant privacy concerns. This paper analyzes existing privacy issues in LLMs, vital for both traditional and metaverse applications, and examines protection techniques across the entire life cycle of these models, from training to user deployment. We delve into cryptography, embedding layer encoding, differential privacy and its variants, and adversarial networks, highlighting their relevance in the metaverse context. Specifically, we explore technologies like homomorphic encryption and secure multiparty computation, which are essential for metaverse security. Our discussion on Gaussian differential privacy, Renyi differential privacy, Edgeworth accounting, and the generation of adversarial samples and loss functions emphasizes their importance in the metaverse's dynamic and interactive environments. Lastly, the paper discusses the current research status and future challenges in the security of LLMs within and beyond the metaverse, emphasizing urgent problems and potential areas for exploration. © 2024 John Wiley & Sons Ltd.},
keywords = {Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers},
pubstate = {published},
tppubtype = {article}
}
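Among the techniques surveyed above is Gaussian differential privacy for model training. A minimal numpy sketch of the Gaussian mechanism in a DP-SGD-style step: clip each per-example gradient, then add calibrated noise (the constants are illustrative, not the survey's recommendations):

import numpy as np

def dp_average_gradients(per_example_grads, clip_norm=1.0, noise_multiplier=1.0,
                         rng=np.random.default_rng(0)):
    # Clip each per-example gradient to bound its L2 sensitivity...
    clipped = [g * min(1.0, clip_norm / (np.linalg.norm(g) + 1e-12))
               for g in per_example_grads]
    mean = np.mean(clipped, axis=0)
    # ...then add Gaussian noise scaled to sensitivity / batch size.
    sigma = noise_multiplier * clip_norm / len(per_example_grads)
    return mean + rng.normal(scale=sigma, size=mean.shape)

grads = [np.random.randn(4) for _ in range(32)]
print(dp_average_gradients(grads))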
Guo, P.; Zhang, Q.; Tian, C.; Xue, W.; Feng, X.
Digital Human Techniques for Education Reform Proceedings Article
In: ICETM - Proc. Int. Conf. Educ. Technol. Manag., pp. 173–178, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071746-8 (ISBN).
Abstract | Links | BibTeX | Tags: Augmented Reality, Contrastive Learning, Digital elevation model, Digital human technique, Digital Human Techniques, Digital humans, Education Reform, Education reforms, Educational Technology, Express emotions, Federated learning, Human behaviors, Human form models, Human techniques, Immersive, Innovative technology, Modeling languages, Natural language processing systems, Teachers', Teaching, Virtual environments, Virtual humans
@inproceedings{guo_digital_2025,
title = {Digital Human Techniques for Education Reform},
author = {P. Guo and Q. Zhang and C. Tian and W. Xue and X. Feng},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001671326&doi=10.1145%2f3711403.3711428&partnerID=40&md5=dd96647315af9409d119f68f9cf4e980},
doi = {10.1145/3711403.3711428},
isbn = {979-840071746-8 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {ICETM - Proc. Int. Conf. Educ. Technol. Manag.},
pages = {173–178},
publisher = {Association for Computing Machinery, Inc},
abstract = {The rapid evolution of artificial intelligence, big data, and generative AI models has ushered in significant transformations across various sectors, including education. Digital Human Technique, an innovative technology grounded in advanced computer science and artificial intelligence, is reshaping educational paradigms by enabling virtual humans to simulate human behavior, express emotions, and interact with users. This paper explores the application of Digital Human Technique in education reform, focusing on creating immersive, intelligent classroom experiences that foster meaningful interactions between teachers and students. We define Digital Human Technique and delve into its key technical components such as character modeling and rendering, natural language processing, computer vision, and augmented reality technologies. Our methodology involves analyzing the role of educational digital humans created through these technologies, assessing their impact on educational processes, and examining various application scenarios in educational reform. Results indicate that Digital Human Technique significantly enhances the learning experience by enabling personalized teaching, increasing engagement, and fostering emotional connections. Educational digital humans serve as virtual teachers, interactive learning aids, and facilitators of emotional interaction, effectively addressing the challenges of traditional educational methods. They also promote a deeper understanding of complex concepts through simulated environments and interactive digital content. © 2024 Copyright held by the owner/author(s).},
keywords = {Augmented Reality, Contrastive Learning, Digital elevation model, Digital human technique, Digital Human Techniques, Digital humans, Education Reform, Education reforms, Educational Technology, Express emotions, Federated learning, Human behaviors, Human form models, Human techniques, Immersive, Innovative technology, Modeling languages, Natural language processing systems, Teachers', Teaching, Virtual environments, Virtual humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, J.; Wu, X.; Lan, T.; Li, B.
LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2715–2724, 2025, ISSN: 1077-2626.
Abstract | Links | BibTeX | Tags: % reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality
@article{chen_llmer_2025,
title = {LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models},
author = {J. Chen and X. Wu and T. Lan and B. Li},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003825793&doi=10.1109%2fTVCG.2025.3549549&partnerID=40&md5=da4681d0714548e3a7e0c8c3295d2348},
doi = {10.1109/TVCG.2025.3549549},
issn = {1077-2626},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2715–2724},
abstract = {The integration of Large Language Models (LLMs) like GPT-4 with Extended Reality (XR) technologies offers the potential to build truly immersive XR environments that interact with human users through natural language, e.g., generating and animating 3D scenes from audio inputs. However, the complexity of XR environments makes it difficult to accurately extract relevant contextual data and scene/object parameters from an overwhelming volume of XR artifacts. This leads not only to increased costs with pay-per-use models, but also to elevated levels of generation errors. Moreover, existing approaches focusing on coding script generation are often prone to generation errors, resulting in flawed or invalid scripts, application crashes, and ultimately a degraded user experience. To overcome these challenges, we introduce LLMER, a novel framework that creates interactive XR worlds using JSON data generated by LLMs. Unlike prior approaches focusing on coding script generation, LLMER translates natural language inputs into JSON data, significantly reducing the likelihood of application crashes and processing latency. It employs a multi-stage strategy to supply only the essential contextual information adapted to the user's request and features multiple modules designed for various XR tasks. Our preliminary user study reveals the effectiveness of the proposed system, with over 80% reduction in consumed tokens and around 60% reduction in task completion time compared to state-of-the-art approaches. The analysis of users' feedback also illuminates a series of directions for further optimization. © 1995-2012 IEEE.},
keywords = {% reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
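LLMER's central idea is that the LLM emits structured JSON rather than executable scripts, so malformed output can be rejected and re-requested instead of crashing the XR application. A minimal sketch of that validation step (the field names are invented; the paper's actual JSON format is richer):

import json

SCHEMA = {"object": str, "action": str, "position": list}  # hypothetical fields

def parse_llm_json(raw: str):
    """Validate LLM output against the expected schema; return None on
    failure so the caller can re-prompt instead of crashing the session."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict) or not all(
            isinstance(data.get(k), t) for k, t in SCHEMA.items()):
        return None
    return data

print(parse_llm_json('{"object": "lamp", "action": "spawn", "position": [0, 1, 2]}'))
print(parse_llm_json("spawn a lamp at the origin"))  # None -> safe fallback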
Ding, S.; Chen, Y.
RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 131–136, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6 (ISBN).
Abstract | Links | BibTeX | Tags: Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing
@inproceedings{ding_rag-vr_2025,
title = {RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments},
author = {S. Ding and Y. Chen},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005140593&doi=10.1109%2fVRW66409.2025.00034&partnerID=40&md5=36dc5fef97aeea4d6e183c83ce9fcd89},
doi = {10.1109/VRW66409.2025.00034},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {131–136},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Recent advances in large language models (LLMs) provide new opportunities for context understanding in virtual reality (VR). However, VR contexts are often highly localized and personalized, limiting the effectiveness of general-purpose LLMs. To address this challenge, we present RAG-VR, the first 3D question-answering system for VR that incorporates retrieval-augmented generation (RAG), which augments an LLM with external knowledge retrieved from a localized knowledge database to improve the answer quality. RAG-VR includes a pipeline for extracting comprehensive knowledge about virtual environments and user conditions for accurate answer generation. To ensure efficient retrieval, RAG-VR offloads the retrieval process to a nearby edge server and uses only essential information during retrieval. Moreover, we train the retriever to effectively distinguish among relevant, irrelevant, and hard-to-differentiate information in relation to questions. RAG-VR improves answer accuracy by 17.9%-41.8% and reduces end-to-end latency by 34.5%-47.3% compared with two baseline systems. © 2025 IEEE.},
keywords = {Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing},
pubstate = {published},
tppubtype = {inproceedings}
}
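RAG-VR grounds the LLM in facts retrieved from a localized knowledge base about the virtual environment. A bare-bones retrieval step using term-frequency cosine similarity stands in here for the trained retriever and edge-server index the paper describes:

from collections import Counter
import math

facts = [
    "The red key unlocks the cellar door.",
    "The health potion sits on the table.",
    "The dragon sleeps until the bell rings.",
]

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[t] * b[t] for t in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

def retrieve(question: str, k: int = 2) -> list[str]:
    q = Counter(question.lower().split())
    return sorted(facts, key=lambda f: cosine(q, Counter(f.lower().split())),
                  reverse=True)[:k]

question = "what does the red key unlock?"
print("Answer using only this context:\n" + "\n".join(retrieve(question))
      + "\nQ: " + question)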
2024
Pester, A.; Tammaa, A.; Gütl, C.; Steinmaurer, A.; El-Seoud, S. A.
Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning Proceedings Article
In: IEEE Global Eng. Edu. Conf., EDUCON, IEEE Computer Society, 2024, ISSN: 2165-9559, ISBN: 979-835039402-3 (ISBN).
Abstract | Links | BibTeX | Tags: Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds
@inproceedings{pester_conversational_2024,
title = {Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning},
author = {A. Pester and A. Tammaa and C. Gütl and A. Steinmaurer and S. A. El-Seoud},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199068668&doi=10.1109%2fEDUCON60312.2024.10578895&partnerID=40&md5=1b904fd8a5e06d7ced42a328c028bbb7},
doi = {10.1109/EDUCON60312.2024.10578895},
issn = {2165-9559},
isbn = {979-835039402-3},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Global Eng. Edu. Conf., EDUCON},
publisher = {IEEE Computer Society},
abstract = {Large Language Models represent a significant breakthrough in Natural Language Processing research and opened a wide range of application domains. This paper demonstrates the successful integration of Large Language Models into immersive learning environments. The review highlights how this emerging technology aligns with pedagogical principles, enhancing the effectiveness of current educational systems. It also reflects recent advancements in integrating Large Language Models, including fine-tuning, hallucination reduction, fact-checking, and human evaluation of generated results. © 2024 IEEE.},
keywords = {Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds},
pubstate = {published},
tppubtype = {inproceedings}
}
Clocchiatti, A.; Fumero, N.; Soccini, A. M.
Character Animation Pipeline based on Latent Diffusion and Large Language Models Proceedings Article
In: Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR, pp. 398–405, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037202-1 (ISBN).
Abstract | Links | BibTeX | Tags: Animation, Animation pipeline, Artificial intelligence, Augmented Reality, Character animation, Computational Linguistics, Computer animation, Deep learning, Diffusion, E-Learning, Extended reality, Film production, Generative art, Language Model, Learning systems, Learning techniques, Natural language processing systems, Pipelines, Production pipelines, Virtual Reality
@inproceedings{clocchiatti_character_2024,
title = {Character Animation Pipeline based on Latent Diffusion and Large Language Models},
author = {A. Clocchiatti and N. Fumero and A. M. Soccini},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85187217072&doi=10.1109%2fAIxVR59861.2024.00067&partnerID=40&md5=d88b9ba7c80d49b60fd0d7acd5e7c4f0},
doi = {10.1109/AIxVR59861.2024.00067},
isbn = {979-835037202-1 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR},
pages = {398–405},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Artificial intelligence and deep learning techniques are revolutionizing the film production pipeline. The majority of the current screenplay-to-animation pipelines focus on understanding the screenplay through natural language processing techniques, and on the generation of the animation through custom engines, missing the possibility to customize the characters. To address these issues, we propose a high-level pipeline for generating 2D characters and animations starting from screenplays, through a combination of Latent Diffusion Models and Large Language Models. Our approach uses ChatGPT to generate character descriptions starting from the screenplay. Then, using that data, it generates images of custom characters with Stable Diffusion and animates them according to their actions in different scenes. The proposed approach avoids well-known problems in generative AI tools such as temporal inconsistency and lack of control on the outcome. The results suggest that the pipeline is consistent and reliable, benefiting industries ranging from film production to virtual, augmented and extended reality content creation. © 2024 IEEE.},
keywords = {Animation, Animation pipeline, Artificial intelligence, Augmented Reality, Character animation, Computational Linguistics, Computer animation, Deep learning, Diffusion, E-Learning, Extended reality, Film production, Generative art, Language Model, Learning systems, Learning techniques, Natural language processing systems, Pipelines, Production pipelines, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
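The pipeline above chains an LLM (character descriptions from the screenplay) into a latent diffusion model (character images) before animation. A structural sketch with stubs standing in for the ChatGPT and Stable Diffusion calls (the stubs return canned values; this shows only the chaining, not the models):

def describe_characters(screenplay: str) -> dict[str, str]:
    # Stand-in for an LLM call that extracts character descriptions.
    return {"ALICE": "a tall botanist in a yellow raincoat"}

def generate_portrait(description: str) -> str:
    # Stand-in for a latent-diffusion call; returns a rendered image path.
    return "portraits/" + description.replace(" ", "_") + ".png"

def build_characters(screenplay: str) -> list[tuple[str, str]]:
    """Screenplay -> (character, portrait path) pairs, one per character."""
    return [(name, generate_portrait(desc))
            for name, desc in describe_characters(screenplay).items()]

print(build_characters("INT. GREENHOUSE - DAY. ALICE waters a glowing fern."))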
Venkatachalam, N.; Rayana, M.; Bala Vignesh, S.; Prathamesh, S.
Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences Proceedings Article
In: Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT, pp. 1133–1138, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835032753-3 (ISBN).
Abstract | Links | BibTeX | Tags: Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets
@inproceedings{venkatachalam_voice-driven_2024,
title = {Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences},
author = {N. Venkatachalam and M. Rayana and S. Bala Vignesh and S. Prathamesh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190121845&doi=10.1109%2fIDCIoT59759.2024.10467441&partnerID=40&md5=6594fbab013d9156b79a887f0d7209cb},
doi = {10.1109/IDCIoT59759.2024.10467441},
isbn = {979-835032753-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT},
pages = {1133–1138},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This research study introduces an innovative system that aims to synthesize 360-degree panoramic images in real time based on vocal prompts from the user, leveraging state-of-the-art Generative AI with a combination of advanced NLP models. The primary objective of this system is to transform spoken descriptions into immersive and interactive visual scenes, specifically designed to provide users with first-person field views. This cutting-edge technology has the potential to revolutionize the realm of virtual reality (VR) experiences, enabling users to effortlessly create and navigate through personalized environments. The fundamental goal of this system is to enable the generation of real-time images that are seamlessly compatible with VR headsets, offering a truly immersive and adaptive visual experience. Beyond its technological advancements, this research also highlights its significant potential for creating a positive social impact. One notable application lies in psychological interventions, particularly in the context of phobia treatment and therapeutic settings. Here, patients can safely confront and work through their fears within these synthesized environments, potentially offering new avenues for therapy. Furthermore, the system serves educational and entertainment purposes by bringing users' imaginations to life, providing an unparalleled platform for exploring the boundaries of virtual experiences. Overall, this research represents a promising stride towards a more immersive and adaptable future in VR technology, with the potential to enhance various aspects of human lives, from mental health treatment to entertainment and education. © 2024 IEEE.},
keywords = {Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets},
pubstate = {published},
tppubtype = {inproceedings}
}
Yin, Z.; Wang, Y.; Papatheodorou, T.; Hui, P.
Text2VRScene: Exploring the Framework of Automated Text-driven Generation System for VR Experience Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR, pp. 701–711, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037402-5 (ISBN).
Abstract | Links | BibTeX | Tags: Automated systems, Automation, Digital contents, Generation systems, Generative model, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Interaction techniques, Language Model, Natural language processing systems, Text input, User interfaces, Virtual Reality
@inproceedings{yin_text2vrscene_2024,
title = {Text2VRScene: Exploring the Framework of Automated Text-driven Generation System for VR Experience},
author = {Z. Yin and Y. Wang and T. Papatheodorou and P. Hui},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85191431035&doi=10.1109%2fVR58804.2024.00090&partnerID=40&md5=5484a5bc3939d003efe68308f56b15a6},
doi = {10.1109/VR58804.2024.00090},
isbn = {979-835037402-5 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR},
pages = {701–711},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {With the recent development of the Virtual Reality (VR) industry, the increasing number of VR users pushes the demand for the massive production of immersive and expressive VR scenes in related industries. However, creating expressive VR scenes involves the reasonable organization of various digital content to express a coherent and logical theme, which is time-consuming and labor-intensive. In recent years, Large Language Models (LLMs) such as ChatGPT 3.5 and generative models such as stable diffusion have emerged as powerful tools for comprehending natural language and generating digital contents such as text, code, images, and 3D objects. In this paper, we have explored how we can generate VR scenes from text by incorporating LLMs and various generative models into an automated system. To achieve this, we first identify the possible limitations of LLMs for an automated system and propose a systematic framework to mitigate them. Subsequently, we developed Text2VRScene, a VR scene generation system, based on our proposed framework with well-designed prompts. To validate the effectiveness of our proposed framework and the designed prompts, we carry out a series of test cases. The results show that the proposed framework contributes to improving the reliability of the system and the quality of the generated VR scenes. The results also illustrate the promising performance of the Text2VRScene in generating satisfying VR scenes with a clear theme regularized by our well-designed prompts. This paper ends with a discussion about the limitations of the current system and the potential of developing similar generation systems based on our framework. © 2024 IEEE.},
keywords = {Automated systems, Automation, Digital contents, Generation systems, Generative model, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Interaction techniques, Language Model, Natural language processing systems, Text input, User interfaces, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
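Text2VRScene's framework hardens the generation loop against typical LLM failure modes through staged, well-designed prompts. One piece such a framework implies is a retry-with-validation loop; a sketch with invented prompts and a toy validator:

def generate_scene(prompt: str, llm, validate, max_tries: int = 3) -> str:
    """Call the LLM, validate the output, and re-prompt with the error
    appended until the result parses or the retry budget runs out."""
    for _ in range(max_tries):
        output = llm(prompt)
        ok, error = validate(output)
        if ok:
            return output
        prompt += f"\nYour previous answer was invalid ({error}). Try again."
    raise RuntimeError("LLM failed validation after retries")

import json
calls = {"n": 0}
def fake_llm(p):  # toy stand-in: succeeds only on the second attempt
    calls["n"] += 1
    return '{"skybox": "sunset"}' if calls["n"] > 1 else "a sunset skybox"
def is_json(s):
    try:
        json.loads(s); return True, ""
    except ValueError as e:
        return False, str(e)

print(generate_scene("Describe the VR scene as JSON.", fake_llm, is_json))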
Krauss, C.; Bassbouss, L.; Upravitelev, M.; An, T. -S.; Altun, D.; Reray, L.; Balitzki, E.; El Tamimi, T.; Karagülle, M.
Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse Proceedings Article
In: Sottilare, R.A.; Schwarz, J. (Ed.): Lect. Notes Comput. Sci., pp. 219–238, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 0302-9743, ISBN: 978-303160608-3 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality
@inproceedings{krauss_opportunities_2024,
title = {Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse},
author = {C. Krauss and L. Bassbouss and M. Upravitelev and T. -S. An and D. Altun and L. Reray and E. Balitzki and T. El Tamimi and M. Karagülle},
editor = {Sottilare R.A. and Schwarz J.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196214138&doi=10.1007%2f978-3-031-60609-0_16&partnerID=40&md5=9a66876cb30e9e5d287a86e6cfa66e05},
doi = {10.1007/978-3-031-60609-0_16},
issn = {0302-9743},
isbn = {978-303160608-3},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14727 LNCS},
pages = {219–238},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {The paper explores the opportunities and challenges for metaverse learning environments with AI-Assistants based on Large Language Models. A proof of concept based on popular but proprietary technologies is presented that enables a natural language exchange between the user and an AI-based medical expert in a highly immersive environment based on the Unreal Engine. The answers generated by ChatGPT are not only played back lip-synchronously, but also visualized in the VR environment using a 3D model of a skeleton. Usability and user experience play a particularly important role in the development of the highly immersive AI-Assistant. The proof of concept serves to illustrate the opportunities and challenges that lie in the merging of large language models, metaverse applications and educational ecosystems, which are self-contained research areas. Development strategies, tools and interoperability standards will be presented to facilitate future developments in this triangle of tension. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Jeong, E.; Kim, H.; Park, S.; Yoon, S.; Ahn, J.; Woo, W.
Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts Proceedings Article
In: Eck, U.; Sra, M.; Stefanucci, J.; Sugimoto, M.; Tatzgern, M.; Williams, I. (Ed.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 205–208, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833150691-9 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics
@inproceedings{jeong_function-adaptive_2024,
title = {Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts},
author = {E. Jeong and H. Kim and S. Park and S. Yoon and J. Ahn and W. Woo},
editor = {Eck U. and Sra M. and Stefanucci J. and Sugimoto M. and Tatzgern M. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85214379963&doi=10.1109%2fISMAR-Adjunct64951.2024.00050&partnerID=40&md5=7222e0599a7e2aa0adaea38e4b9e13cc},
doi = {10.1109/ISMAR-Adjunct64951.2024.00050},
isbn = {979-833150691-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {205–208},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {We propose an algorithm that extracts the most suitable affordances, interaction targets, and corresponding coordinates adaptively from 3D models of various artifacts based on their functional context for efficient authoring of XR content with artifacts. Traditionally, authoring AR scenes to convey artifact context required one-to-one manual work. Our approach leverages a Large Language Model (LLM) to extract interaction types, positions, and subjects based on the artifact's name and usage context. This enables templated XR experience creation, replacing repetitive manual labor. Consequently, our system streamlines the XR authoring process, making it more efficient and scalable. © 2024 IEEE.},
keywords = {3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics},
pubstate = {published},
tppubtype = {inproceedings}
}
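The system above queries an LLM for affordances suited to an artifact's name and usage context, then anchors them on the 3D model. A sketch of that extraction contract (the JSON shape and the stubbed reply are assumptions, not the authors' prompt format):

import json
from dataclasses import dataclass

@dataclass
class Affordance:
    interaction: str   # e.g. "pour"
    target: str        # sub-part of the artifact
    position: tuple    # anchor on the 3D model, in object space

def extract_affordances(name: str, context: str, llm) -> list[Affordance]:
    reply = llm(f"List affordances of '{name}' used for {context} as JSON.")
    return [Affordance(a["interaction"], a["target"], tuple(a["position"]))
            for a in json.loads(reply)]

fake_llm = lambda _prompt: (
    '[{"interaction": "pour", "target": "spout", "position": [0.1, 0.3, 0.0]}]')
print(extract_affordances("teapot", "serving tea", fake_llm))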
Cronin, I.
Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications Book
Apress Media LLC, 2024, ISBN: 979-886880282-9 (ISBN); 979-886880281-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting
@book{cronin_understanding_2024,
title = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
author = {I. Cronin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001777571&doi=10.1007%2f979-8-8688-0282-9&partnerID=40&md5=c0714ff3e1ad755596426ea092b830d6},
doi = {10.1007/979-8-8688-0282-9},
isbn = {979-886880282-9 (ISBN); 979-886880281-2 (ISBN)},
year = {2024},
date = {2024-01-01},
publisher = {Apress Media LLC},
series = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
abstract = {This guide covers the fundamental technical principles and various business applications of Generative AI for planning, developing, and evaluating AI-driven products. It equips you with the knowledge you need to harness the potential of Generative AI for enhancing business creativity and productivity. The book is organized into three sections: text-based, senses-based, and rationale-based. Each section provides an in-depth exploration of the specific methods and applications of Generative AI. In the text-based section, you will find detailed discussions on designing algorithms to automate and enhance written communication, including insights into the technical aspects of transformer-based Natural Language Processing (NLP) and chatbot architecture, such as GPT-4, Claude 2, Google Bard, and others. The senses-based section offers a glimpse into the algorithms and data structures that underpin visual, auditory, and multisensory experiences, including NeRF, 3D Gaussian Splatting, Stable Diffusion, AR and VR technologies, and more. The rationale-based section illuminates the decision-making capabilities of AI, with a focus on machine learning and data analytics techniques that empower applications such as simulation models, agents, and autonomous systems. In summary, this book serves as a guide for those seeking to navigate the dynamic landscape of Generative AI. Whether you’re a seasoned AI professional or a business leader looking to harness the power of creative automation, these pages offer a roadmap to leverage Generative AI for your organization’s success. © 2024 by Irena Cronin.},
keywords = {Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting},
pubstate = {published},
tppubtype = {book}
}
Kapadia, N.; Gokhale, S.; Nepomuceno, A.; Cheng, W.; Bothwell, S.; Mathews, M.; Shallat, J. S.; Schultz, C.; Gupta, A.
Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator Proceedings Article
In: Chen, J.Y.C.; Fragomeni, G. (Ed.): Lect. Notes Comput. Sci., pp. 200–212, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 0302-9743, ISBN: 978-303161040-0 (ISBN).
Abstract | Links | BibTeX | Tags: Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality
@inproceedings{kapadia_evaluation_2024,
title = {Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator},
author = {N. Kapadia and S. Gokhale and A. Nepomuceno and W. Cheng and S. Bothwell and M. Mathews and J. S. Shallat and C. Schultz and A. Gupta},
editor = {Chen J.Y.C. and Fragomeni G.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196200653&doi=10.1007%2f978-3-031-61041-7_13&partnerID=40&md5=8890a8d0c289fdf6e7ab82e105249097},
doi = {10.1007/978-3-031-61041-7_13},
issn = {0302-9743},
isbn = {978-303161040-0},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14706 LNCS},
pages = {200–212},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper explores the efficacy of Large Language Models (LLMs) in generating dialogues for patient avatars in Virtual Reality (VR) nurse training simulators. With the integration of technology in healthcare education evolving rapidly, the potential of NLP to enhance nurse training through realistic patient interactions presents a significant opportunity. Our study introduces a novel LLM-based dialogue generation system, leveraging models such as ChatGPT, GoogleBard, and ClaudeAI. We detail the development of our script generation system, which was a collaborative endeavor involving nurses, technical artists, and developers. The system, tested on the Meta Quest 2 VR headset, integrates complex dialogues created through a synthesis of clinical expertise and advanced NLP, aimed at simulating real-world nursing scenarios. Through a comprehensive evaluation involving lexical and semantic similarity tests compared to clinical expert-generated scripts, we assess the potential of LLMs as suitable alternatives for script generation. The findings aim to contribute to the development of a more interactive and effective VR nurse training simulator, enhancing communication skills among nursing students for improved patient care outcomes. This research underscores the importance of advanced NLP applications in healthcare education, offering insights into the practicality and limitations of employing LLMs in clinical training environments. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
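The evaluation above compares LLM-generated dialogue with clinical expert scripts via lexical and semantic similarity tests. A minimal sketch of both axes, using edit-based similarity plus a bag-of-words cosine as a crude stand-in for the embedding-based semantic measures a real evaluation would use:

from difflib import SequenceMatcher
from collections import Counter
import math

def lexical_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

def semantic_similarity(a: str, b: str) -> float:
    # Crude proxy: cosine over word counts (real work uses sentence embeddings).
    va, vb = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(va[w] * vb[w] for w in va)
    norm = (math.sqrt(sum(v * v for v in va.values()))
            * math.sqrt(sum(v * v for v in vb.values())))
    return dot / norm if norm else 0.0

expert = "On a scale of zero to ten, how bad is your pain right now?"
generated = "How severe is your pain right now, from zero to ten?"
print(lexical_similarity(expert, generated), semantic_similarity(expert, generated))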
Geurts, E.; Warson, D.; Rovelo Ruiz, G.
Boosting Motivation in Sports with Data-Driven Visualizations in VR Proceedings Article
In: ACM Int. Conf. Proc. Ser., Association for Computing Machinery, 2024, ISBN: 979-840071764-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Asynchronoi social interaction, Asynchronous social interaction, Cycling, Data driven, Dynamics, Extended reality, Group dynamics, Language Model, Large language model, large language models, Motivation, Natural language processing systems, Real-world, Real-world data, Social interactions, Sports, User interface, User interfaces, Virtual Reality, Visualization, Visualizations
@inproceedings{geurts_boosting_2024,
title = {Boosting Motivation in Sports with Data-Driven Visualizations in VR},
author = {E. Geurts and D. Warson and G. Rovelo Ruiz},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85195387493&doi=10.1145%2f3656650.3656669&partnerID=40&md5=ec69e7abe61e572a94261ad6bbfed11c},
doi = {10.1145/3656650.3656669},
isbn = {979-840071764-2 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
publisher = {Association for Computing Machinery},
abstract = {In recent years, the integration of Artificial Intelligence (AI) has sparked revolutionary progress across diverse domains, with sports applications being no exception. At the same time, using real-world data sources, such as GPS, weather, and traffic data, offers opportunities to improve the overall user engagement and effectiveness of such applications. Despite the substantial advancements, including proven success in mobile applications, there remains an untapped potential in leveraging these technologies to boost motivation and enhance social group dynamics in Virtual Reality (VR) sports solutions. Our innovative approach focuses on harnessing the power of AI and real-world data to facilitate the design of such VR systems. To validate our methodology, we conducted an exploratory study involving 18 participants, evaluating our approach within the context of indoor VR cycling. By incorporating GPX files and omnidirectional video (real-world data), we recreated a lifelike cycling environment in which users can compete with simulated cyclists navigating a chosen (real-world) route. Considering the user's performance and interactions with other cyclists, our system employs AI-driven natural language processing tools to generate encouraging and competitive messages automatically. The outcome of our study reveals a positive impact on motivation, competition dynamics, and the perceived sense of group dynamics when using real performance data alongside automatically generated motivational messages. This underscores the potential of AI-driven enhancements in user interfaces to not only optimize performance but also foster a more engaging and supportive sports environment. © 2024 ACM.},
keywords = {Artificial intelligence, Asynchronoi social interaction, Asynchronous social interaction, Cycling, Data driven, Dynamics, Extended reality, Group dynamics, Language Model, Large language model, large language models, Motivation, Natural language processing systems, Real-world, Real-world data, Social interactions, Sports, User interface, User interfaces, Virtual Reality, Visualization, Visualizations},
pubstate = {published},
tppubtype = {inproceedings}
}
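The study above rebuilds real routes from GPX files and turns performance data into motivational messages. A compact sketch of that data path: parse track points, estimate speed, pick a message (the fixed sampling interval and the message strings are illustrative; the paper generates messages with AI-driven NLP tools):

import math
import xml.etree.ElementTree as ET

GPX = """<gpx><trk><trkseg>
<trkpt lat="50.930" lon="5.340"></trkpt>
<trkpt lat="50.931" lon="5.342"></trkpt>
</trkseg></trk></gpx>"""

def haversine_m(p, q):
    R = 6371000.0
    (la1, lo1), (la2, lo2) = map(math.radians, p), map(math.radians, q)
    a = (math.sin((la2 - la1) / 2) ** 2
         + math.cos(la1) * math.cos(la2) * math.sin((lo2 - lo1) / 2) ** 2)
    return 2 * R * math.asin(math.sqrt(a))

pts = [(float(e.get("lat")), float(e.get("lon")))
       for e in ET.fromstring(GPX).iter("trkpt")]
dist = sum(haversine_m(pts[i], pts[i + 1]) for i in range(len(pts) - 1))
speed_kmh = dist / 10.0 * 3.6   # assume 10 s between samples (illustrative)
print("Keep pushing!" if speed_kmh < 25 else "Great pace - stay with the group!")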
Tang, Y.; Situ, J.; Huang, Y.
Beyond User Experience: Technical and Contextual Metrics for Large Language Models in Extended Reality Proceedings Article
In: UbiComp Companion - Companion ACM Int. Jt. Conf. Pervasive Ubiquitous Comput., pp. 640–643, Association for Computing Machinery, Inc, 2024, ISBN: 979-840071058-2 (ISBN).
Abstract | Links | BibTeX | Tags: Augmented Reality, Computer simulation languages, Evaluation Metrics, Extended reality, Language Model, Large language model, large language models, Mixed reality, Modeling performance, Natural language processing systems, Physical world, Spatial computing, spatial data, user experience, Users' experiences, Virtual environments, Virtual Reality
@inproceedings{tang_beyond_2024,
title = {Beyond User Experience: Technical and Contextual Metrics for Large Language Models in Extended Reality},
author = {Y. Tang and J. Situ and Y. Huang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85206203437&doi=10.1145%2f3675094.3678995&partnerID=40&md5=3fb337872b483a163bfbea038f1baffe},
doi = {10.1145/3675094.3678995},
isbn = {979-840071058-2 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {UbiComp Companion - Companion ACM Int. Jt. Conf. Pervasive Ubiquitous Comput.},
pages = {640–643},
publisher = {Association for Computing Machinery, Inc},
abstract = {Spatial Computing involves interacting with the physical world through spatial data manipulation, closely linked with Extended Reality (XR), which includes Virtual Reality (VR), Augmented Reality (AR), and Mixed Reality (MR). Large Language Models (LLMs) significantly enhance XR applications by improving user interactions through natural language understanding and content generation. Typical evaluations of these applications focus on user experience (UX) metrics, such as task performance, user satisfaction, and psychological assessments, but often neglect the technical performance of the LLMs themselves. This paper identifies significant gaps in current evaluation practices for LLMs within XR environments, attributing them to the novelty of the field, the complexity of spatial contexts, and the multimodal nature of interactions in XR. To address these gaps, the paper proposes specific metrics tailored to evaluate LLM performance in XR contexts, including spatial contextual awareness, coherence, proactivity, multimodal integration, hallucination, and question-answering accuracy. These proposed metrics aim to complement existing UX evaluations, providing a comprehensive assessment framework that captures both the technical and user-centric aspects of LLM performance in XR applications. The conclusion underscores the necessity for a dual-focused approach that combines technical and UX metrics to ensure effective and user-friendly LLM-integrated XR systems. © 2024 Copyright held by the owner/author(s).},
keywords = {Augmented Reality, Computer simulation languages, Evaluation Metrics, Extended reality, Language Model, Large language model, large language models, Mixed reality, Modeling performance, Natural language processing systems, Physical world, Spatial computing, spatial data, user experience, Users' experiences, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
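Two of the technical metrics proposed above, question-answering accuracy and hallucination rate, reduce to simple tallies once each response is labeled. A small harness sketch (the record format and grounding labels are invented for illustration):

def score(records):
    """records: [{'answer': str, 'gold': str, 'grounded': bool}, ...]
    Returns (QA accuracy, hallucination rate) over the evaluation set."""
    n = len(records)
    correct = sum(r["answer"].strip().lower() == r["gold"].strip().lower()
                  for r in records)
    hallucinated = sum(not r["grounded"] for r in records)
    return correct / n, hallucinated / n

evalset = [
    {"answer": "on the table", "gold": "on the table", "grounded": True},
    {"answer": "behind you", "gold": "on the shelf", "grounded": False},
]
print(score(evalset))  # (0.5, 0.5)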
2023
Fuchs, A.; Appel, S.; Grimm, P.
Immersive Spaces for Creativity: Smart Working Environments Proceedings Article
In: Yunanto, A.A.; Ramadhani, A.D.; Prayogi, Y.R.; Putra, P.A.M.; Ruswiansari, M.; Ridwan, M.; Gamar, F.; Rahmawati, W.M.; Rusli, M.R.; Humaira, F.M.; Adila, A.F. (Ed.): IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding, pp. 610–617, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835031473-1 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment
@inproceedings{fuchs_immersive_2023,
title = {Immersive Spaces for Creativity: Smart Working Environments},
author = {A. Fuchs and S. Appel and P. Grimm},
editor = {Yunanto A.A. and Ramadhani A.D. and Prayogi Y.R. and Putra P.A.M. and Ruswiansari M. and Ridwan M. and Gamar F. and Rahmawati W.M. and Rusli M.R. and Humaira F.M. and Adila A.F.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85173627291&doi=10.1109%2fIES59143.2023.10242458&partnerID=40&md5=6ab1796f68c29d7747574272314a2e9d},
doi = {10.1109/IES59143.2023.10242458},
isbn = {979-835031473-1 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding},
pages = {610–617},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This paper presents an innovative approach to designing an immersive space that dynamically supports users (inter-)action based on users' behavior, voice, and mood, providing a personalized experience. The objective of this research is to explore how a space can communicate with users in a seamless, engaging, and interactive environment. Therefore, it integrates natural language processing (NLP), generative artificial intelligence applications and human computer interaction that utilizes a combination of sensors, microphones, and cameras to collect real-time data on users' behavior, voice, and mood. This data is then processed and analyzed by an intelligent system that employs machine learning algorithms to identify patterns and adapt the environment accordingly. The adaptive features include changes in lighting, sound, and visual elements to facilitate creativity, focus, relaxation, or socialization, depending on the user's topics and emotional state. The paper discusses the technical aspects of implementing such a system. Additionally, it highlights the potential applications of this technology in various domains such as education, entertainment, and workplace settings. In conclusion, the immersive creative space represents a paradigm shift in human-environment interaction, offering a dynamic and personalized space that caters to the diverse needs of users. The research findings suggest that this innovative approach holds great promise for enhancing user experiences, fostering creativity, and promoting overall well-being. © 2023 IEEE.},
keywords = {Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment},
pubstate = {published},
tppubtype = {inproceedings}
}
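The space above classifies user state from sensed behavior, voice, and mood, then adapts lighting and sound accordingly. A toy sketch of the adaptation step only, as a rule table from a classified mood to environment presets (states and presets are invented; the paper drives this from live sensor streams and learned models):

PRESETS = {
    # mood -> (lighting, soundscape); values invented for illustration
    "focused":  ("cool white, low intensity", "silence"),
    "stressed": ("warm dim", "slow ambient"),
    "social":   ("bright warm", "soft background music"),
}

def adapt(mood: str) -> None:
    lighting, sound = PRESETS.get(mood, PRESETS["focused"])
    print(f"lighting -> {lighting}; sound -> {sound}")

adapt("stressed")  # lighting -> warm dim; sound -> slow ambient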
Joseph, S.; Priya, B. S.; Poorvaja, R.; Santhosh Kumaran, M.; Shivaraj, S.; Jeyanth, V.; Shivesh, R. P.
IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI Proceedings Article
In: Arya, K.V.; Wada, T. (Ed.): Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835030514-2 (ISBN).
Abstract | Links | BibTeX | Tags: 2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones
@inproceedings{joseph_iot_2023,
title = {IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI},
author = {S. Joseph and B. S. Priya and R. Poorvaja and M. Santhosh Kumaran and S. Shivaraj and V. Jeyanth and R. P. Shivesh},
editor = {Arya K.V. and Wada T.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85189754688&doi=10.1109%2fCVMI59935.2023.10465077&partnerID=40&md5=9c1a9d7151c0b04bab83586f515d30aa},
doi = {10.1109/CVMI59935.2023.10465077},
isbn = {979-835030514-2 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {In anticipation of the widespread adoption of augmented reality in the future, this paper introduces an advanced mobile application that seamlessly integrates AR and IoT technologies. The application aims to make these cutting-edge technologies more affordable and accessible to users while highlighting their immense benefits in assisting with household appliance control, as well as providing interactive and educational experiences. The app employs advanced algorithms such as object detection, Natural Language Processing (NLP), and Optical Character Recognition (OCR) to scan the smartphone's camera feed. Upon identification, AR controls for appliances, their power consumption, and electric bill tracking are displayed. Additionally, the application makes use of APIs to access the internet, retrieving relevant 3D generative models, 360-degree videos, 2D images, and textual information based on user interactions with detected objects. Users can effortlessly explore and interact with the 3D generative models using intuitive hand gestures, providing an immersive experience without the need for additional hardware or dedicated VR headsets. Beyond home automation, the app offers valuable educational benefits, serving as a unique learning tool for students to gain hands-on experience. Medical practitioners can quickly reference organ anatomy and utilize its feature-rich functionalities. Its cost-effectiveness, requiring only installation, ensures accessibility to a wide audience. The app's functionality is both intuitive and efficient, detecting objects in the camera feed and prompting user interactions. Users can select objects through simple hand gestures, choosing desired content like 3D generative models, 2D images, textual information, 360-degree videos, or shopping-related details. The app then retrieves and overlays the requested information onto the real-world view in AR. In conclusion, this groundbreaking AR- and IoT-powered app revolutionizes home automation and learning experiences, leveraging only a smartphone's camera, without the need for additional hardware or expensive installations. Its potential applications extend to education, industries, and health care, making it a versatile and valuable tool for a broad range of users. © 2023 IEEE.},
keywords = {2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones},
pubstate = {published},
tppubtype = {inproceedings}
}
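Among the app's overlays are per-appliance power consumption and an electric-bill estimate; the arithmetic behind such an overlay is simple. A sketch with made-up wattages and tariff:

WATTS = {"fan": 70, "refrigerator": 150, "television": 100}  # illustrative ratings
TARIFF = 0.15  # currency units per kWh, illustrative

def monthly_cost(appliance: str, hours_per_day: float) -> float:
    kwh = WATTS[appliance] / 1000 * hours_per_day * 30
    return round(kwh * TARIFF, 2)

print(monthly_cost("refrigerator", 24))  # 150 W * 720 h = 108 kWh -> 16.2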
DeChant, C.; Akinola, I.; Bauer, D.
Learning to summarize and answer questions about a virtual robot’s past actions Journal Article
In: Autonomous Robots, vol. 47, no. 8, pp. 1103–1118, 2023, ISSN: 0929-5593.
Abstract | Links | BibTeX | Tags: Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning
@article{dechant_learning_2023,
title = {Learning to summarize and answer questions about a virtual robot’s past actions},
author = {C. DeChant and I. Akinola and D. Bauer},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85176588341&doi=10.1007%2fs10514-023-10134-4&partnerID=40&md5=162b3343d5f000f2b79f59c339f99022},
doi = {10.1007/s10514-023-10134-4},
issn = {0929-5593},
year = {2023},
date = {2023-01-01},
journal = {Autonomous Robots},
volume = {47},
number = {8},
pages = {1103–1118},
abstract = {When robots perform long action sequences, users will want to easily and reliably find out what they have done. We therefore demonstrate the task of learning to summarize and answer questions about a robot agent’s past actions using natural language alone. A single system with a large language model at its core is trained to both summarize and answer questions about action sequences given ego-centric video frames of a virtual robot and a question prompt. To enable training of question answering, we develop a method to automatically generate English-language questions and answers about objects, actions, and the temporal order in which actions occurred during episodes of robot action in the virtual environment. Training one model to both summarize and answer questions enables zero-shot transfer of representations of objects learned through question answering to improved action summarization. © 2023, The Author(s).},
keywords = {Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning},
pubstate = {published},
tppubtype = {article}
}
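The method above auto-generates English questions and answers about objects, actions, and temporal order in episodes of robot action. A minimal sketch of templated QA generation over an action log (the templates are simplified stand-ins for the authors' generator):

def make_qa(episode):
    """episode: ordered (action, object) pairs from one robot run."""
    qa = []
    for i, (action, obj) in enumerate(episode):
        qa.append((f"What did the robot {action}?", obj))
        qa.append((f"What was the {ordinal(i + 1)} action?", f"{action} {obj}"))
    qa.append(("Which object was involved first?", episode[0][1]))
    return qa

def ordinal(n):
    return {1: "first", 2: "second", 3: "third"}.get(n, f"{n}th")

episode = [("pick up", "red block"), ("place", "blue tray")]
for q, a in make_qa(episode):
    print(q, "->", a)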