AHCI RESEARCH GROUP
Publications
Papers published in international journals,
proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTeX record for each paper.
2025
Tortora, A.; Amaro, I.; Della Greca, A.; Barra, P.
Exploring the Role of Generative Artificial Intelligence in Virtual Reality: Opportunities and Future Perspectives Proceedings Article
In: Chen, J.Y.C.; Fragomeni, G. (Ed.): Lect. Notes Comput. Sci., pp. 125–142, Springer Science and Business Media Deutschland GmbH, 2025, ISSN: 0302-9743; ISBN: 978-3-031-93699-9.
Abstract | Links | BibTeX | Tags: Ethical technology, Future perspectives, Generative AI, Image modeling, Immersive, immersive experience, Immersive Experiences, Information Management, Language Model, Personnel training, Professional training, Real-time, Sensitive data, Training design, Users' experiences, Virtual Reality
@inproceedings{tortora_exploring_2025,
title = {Exploring the Role of Generative Artificial Intelligence in Virtual Reality: Opportunities and Future Perspectives},
author = {A. Tortora and I. Amaro and A. Della Greca and P. Barra},
editor = {Chen J.Y.C. and Fragomeni G.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007788684&doi=10.1007%2f978-3-031-93700-2_9&partnerID=40&md5=7b69183bbf8172f9595f939254fb6831},
doi = {10.1007/978-3-031-93700-2_9},
issn = {0302-9743},
isbn = {978-3-031-93699-9},
year = {2025},
date = {2025-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {15788 LNCS},
pages = {125–142},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {In recent years, generative AI, such as language and image models, has started to revolutionize virtual reality (VR) by offering new opportunities for immersive and personalized interaction. This paper explores the potential of these Intelligent Augmentation technologies in the context of VR, analyzing how the generation of text and images in real time can enhance the user experience through dynamic and personalized environments and contents. The integration of generative AI in VR scenarios holds promise in multiple fields, including education, professional training, design, and healthcare. However, its implementation involves significant challenges, such as privacy management, data security, and ethical issues related to cognitive manipulation and representation of reality. Through an overview of current applications and future prospects, this paper highlights the crucial role of generative AI in enhancing VR, helping to outline a path for the ethical and sustainable development of these immersive technologies. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.},
keywords = {Ethical technology, Future perspectives, Generative AI, Image modeling, Immersive, immersive experience, Immersive Experiences, Information Management, Language Model, Personnel training, Professional training, Real-time, Sensitive data, Training design, Users' experiences, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Casas, L.; Hannah, S.; Mitchell, K.
HoloJig: Interactive Spoken Prompt Specified Generative AI Environments Journal Article
In: IEEE Computer Graphics and Applications, vol. 45, no. 2, pp. 69–77, 2025, ISSN: 0272-1716.
Abstract | Links | BibTeX | Tags: 3-D rendering, Article, Collaborative workspace, customer experience, Economic and social effects, generative artificial intelligence, human, Immersive, Immersive environment, parallax, Real-time, simulation, Simulation training, speech, Time based, Virtual environments, Virtual Reality, Virtual reality experiences, Virtual spaces, VR systems
@article{casas_holojig_2025,
title = {HoloJig: Interactive Spoken Prompt Specified Generative AI Environments},
author = {L. Casas and S. Hannah and K. Mitchell},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001182100&doi=10.1109%2fMCG.2025.3553780&partnerID=40&md5=ec5dc44023314b6f9221169357d81dcd},
doi = {10.1109/MCG.2025.3553780},
issn = {0272-1716},
year = {2025},
date = {2025-01-01},
journal = {IEEE Computer Graphics and Applications},
volume = {45},
number = {2},
pages = {69–77},
abstract = {HoloJig offers an interactive speech-to-virtual reality (VR) experience that generates diverse environments in real time based on live spoken descriptions. Unlike traditional VR systems that rely on prebuilt assets, HoloJig dynamically creates personalized and immersive virtual spaces with depth-based parallax 3-D rendering, allowing users to define the characteristics of their immersive environment through verbal prompts. This generative approach opens up new possibilities for interactive experiences, including simulations, training, collaborative workspaces, and entertainment. In addition to speech-to-VR environment generation, a key innovation of HoloJig is its progressive visual transition mechanism, which smoothly dissolves between previously generated and newly requested environments, mitigating the delay caused by neural computations. This feature ensures a seamless and continuous user experience, even as new scenes are being rendered on remote servers. © 1981-2012 IEEE.},
keywords = {3-D rendering, Article, Collaborative workspace, customer experience, Economic and social effects, generative artificial intelligence, human, Immersive, Immersive environment, parallax, Real-time, simulation, Simulation training, speech, Time based, Virtual environments, Virtual Reality, Virtual reality experiences, Virtual spaces, VR systems},
pubstate = {published},
tppubtype = {article}
}
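The progressive visual transition described in the HoloJig abstract reduces, at its core, to a per-frame blend between the outgoing and incoming environments while the new scene is still rendering remotely. A minimal sketch of such a blend factor in Python (our illustration of the idea, not code from the paper):

def dissolve_weight(elapsed: float, duration: float) -> float:
    """Blend factor in [0, 1]: 0 = old environment only, 1 = new environment only."""
    t = min(max(elapsed / duration, 0.0), 1.0)  # clamp to the transition window
    return t * t * (3.0 - 2.0 * t)              # smoothstep: eases in and out

Each frame, the compositor would mix old and new scene pixels with this weight, so the dissolve keeps progressing smoothly even while the remote server is still streaming the new environment.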
Behravan, M.; Gračanin, D.
From Voices to Worlds: Developing an AI-Powered Framework for 3D Object Generation in Augmented Reality Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 150–155, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D object, 3D Object Generation, 3D reconstruction, Augmented Reality, Cutting edges, Generative AI, Interactive computer systems, Language Model, Large language model, large language models, matrix, Multilingual speech interaction, Real-time, Speech enhancement, Speech interaction, Volume Rendering
@inproceedings{behravan_voices_2025,
title = {From Voices to Worlds: Developing an AI-Powered Framework for 3D Object Generation in Augmented Reality},
author = {M. Behravan and D. Gračanin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005153589&doi=10.1109%2fVRW66409.2025.00038&partnerID=40&md5=b8aaab4e2378cde3595d98d79266d371},
doi = {10.1109/VRW66409.2025.00038},
isbn = {979-833151484-6},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {150–155},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This paper presents Matrix, an advanced AI-powered framework designed for real-time 3D object generation in Augmented Reality (AR) environments. By integrating a cutting-edge text-to-3D generative AI model, multilingual speech-to-text translation, and large language models (LLMs), the system enables seamless user interactions through spoken commands. The framework processes speech inputs, generates 3D objects, and provides object recommendations based on contextual understanding, enhancing AR experiences. A key feature of this framework is its ability to optimize 3D models by reducing mesh complexity, resulting in significantly smaller file sizes and faster processing on resource-constrained AR devices. Our approach addresses the challenges of high GPU usage, large model output sizes, and real-time system responsiveness, ensuring a smoother user experience. Moreover, the system is equipped with a pre-generated object repository, further reducing GPU load and improving efficiency. We demonstrate the practical applications of this framework in various fields such as education, design, and accessibility, and discuss future enhancements including image-to-3D conversion, environmental object detection, and multimodal support. The open-source nature of the framework promotes ongoing innovation and its utility across diverse industries. © 2025 IEEE.},
keywords = {3D modeling, 3D object, 3D Object Generation, 3D reconstruction, Augmented Reality, Cutting edges, Generative AI, Interactive computer systems, Language Model, Large language model, large language models, matrix, Multilingual speech interaction, Real-time, Speech enhancement, Speech interaction, Volume Rendering},
pubstate = {published},
tppubtype = {inproceedings}
}
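The Matrix abstract outlines a pipeline of multilingual speech-to-text, text-to-3D generation, mesh simplification, and a pre-generated object repository. A compact Python sketch of that flow follows; every helper marked HYPOTHETICAL stands in for a component the paper names but whose interface we are assuming:

from dataclasses import dataclass

@dataclass
class Mesh:
    vertices: list  # [(x, y, z), ...]
    faces: list     # [(i, j, k), ...]

def transcribe(audio: bytes) -> str:
    """HYPOTHETICAL: multilingual speech-to-text, returning a text prompt."""
    ...

def text_to_3d(prompt: str) -> Mesh:
    """HYPOTHETICAL: text-to-3D generative model."""
    ...

def decimate(mesh: Mesh, target_faces: int) -> Mesh:
    """HYPOTHETICAL: mesh simplification for resource-constrained AR devices."""
    ...

REPOSITORY: dict[str, Mesh] = {}  # pre-generated objects, served without GPU work

def handle_spoken_request(audio: bytes) -> Mesh:
    prompt = transcribe(audio)
    if prompt in REPOSITORY:      # repository hit: skip generation entirely
        return REPOSITORY[prompt]
    mesh = decimate(text_to_3d(prompt), target_faces=5_000)  # smaller files, faster loading
    REPOSITORY[prompt] = mesh
    return mesh

The repository check and the decimation step mirror the paper's stated goals of reducing GPU load and output size; the 5,000-face target is an arbitrary placeholder.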
Behravan, M.; Haghani, M.; Gračanin, D.
Transcending Dimensions Using Generative AI: Real-Time 3D Model Generation in Augmented Reality Proceedings Article
In: Chen, J.Y.C.; Fragomeni, G. (Ed.): Lect. Notes Comput. Sci., pp. 13–32, Springer Science and Business Media Deutschland GmbH, 2025, ISSN: 0302-9743; ISBN: 978-3-031-93699-9.
Abstract | Links | BibTeX | Tags: 3D Model Generation, 3D modeling, 3D models, 3d-modeling, Augmented Reality, Generative AI, Image-to-3D conversion, Model generation, Object Detection, Object recognition, Objects detection, Real-time, Specialized software, Technical expertise, Three dimensional computer graphics, Usability engineering
@inproceedings{behravan_transcending_2025,
title = {Transcending Dimensions Using Generative AI: Real-Time 3D Model Generation in Augmented Reality},
author = {M. Behravan and M. Haghani and D. Gračanin},
editor = {Chen J.Y.C. and Fragomeni G.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007690904&doi=10.1007%2f978-3-031-93700-2_2&partnerID=40&md5=1c4d643aad88d08cbbc9dd2c02413f10},
doi = {10.1007/978-3-031-93700-2_2},
issn = {0302-9743},
isbn = {978-3-031-93699-9},
year = {2025},
date = {2025-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {15788 LNCS},
pages = {13–32},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {Traditional 3D modeling requires technical expertise, specialized software, and time-intensive processes, making it inaccessible for many users. Our research aims to lower these barriers by combining generative AI and augmented reality (AR) into a cohesive system that allows users to easily generate, manipulate, and interact with 3D models in real time, directly within AR environments. Utilizing cutting-edge AI models like Shap-E, we address the complex challenges of transforming 2D images into 3D representations in AR environments. Key challenges such as object isolation, handling intricate backgrounds, and achieving seamless user interaction are tackled through advanced object detection methods, such as Mask R-CNN. Evaluation results from 35 participants reveal an overall System Usability Scale (SUS) score of 69.64, with participants who engaged with AR/VR technologies more frequently rating the system significantly higher, at 80.71. This research is particularly relevant for applications in gaming, education, and AR-based e-commerce, offering intuitive model creation for users without specialized skills. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.},
keywords = {3D Model Generation, 3D modeling, 3D models, 3d-modeling, Augmented Reality, Generative AI, Image-to-3D conversion, Model generation, Object Detection, Object recognition, Objects detection, Real-time, Specialized software, Technical expertise, Three dimensional computer graphics, Usability engineering},
pubstate = {published},
tppubtype = {inproceedings}
}
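As background to the usability numbers quoted above (69.64 overall, 80.71 for frequent AR/VR users): SUS scores come from the standard System Usability Scale formula, sketched here for reference. This is general methodology, not code from the paper.

def sus_score(responses: list[int]) -> float:
    """Standard SUS scoring: ten 1-5 Likert answers, item 1 first."""
    assert len(responses) == 10 and all(1 <= r <= 5 for r in responses)
    raw = sum((r - 1) if i % 2 == 0 else (5 - r)  # 0-indexed even = odd-numbered item
              for i, r in enumerate(responses))
    return raw * 2.5  # scales the 0-40 raw sum to the familiar 0-100 range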
Afzal, M. Z.; Ali, S. K. A.; Stricker, D.; Eisert, P.; Hilsmann, A.; Perez-Marcos, D.; Bianchi, M.; Crottaz-Herbette, S.; De Ioris, R.; Mangina, E.; Sanguineti, M.; Salaberria, A.; Lopez De Lacalle, O.; Garcia-Pablos, A.; Cuadros, M.
Next Generation XR Systems - Large Language Models Meet Augmented and Virtual Reality Journal Article
In: IEEE Computer Graphics and Applications, vol. 45, no. 1, pp. 43–55, 2025, ISSN: 0272-1716.
Abstract | Links | BibTeX | Tags: adult, Article, Augmented and virtual realities, Augmented Reality, Awareness, Context-Aware, human, Information Retrieval, Knowledge model, Knowledge reasoning, Knowledge retrieval, Language Model, Large language model, Mixed reality, neurorehabilitation, Position papers, privacy, Real-time, Reasoning, Situational awareness, Virtual environments, Virtual Reality
@article{afzal_next_2025,
title = {Next Generation XR Systems - Large Language Models Meet Augmented and Virtual Reality},
author = {M. Z. Afzal and S. K. A. Ali and D. Stricker and P. Eisert and A. Hilsmann and D. Perez-Marcos and M. Bianchi and S. Crottaz-Herbette and R. De Ioris and E. Mangina and M. Sanguineti and A. Salaberria and O. Lopez De Lacalle and A. Garcia-Pablos and M. Cuadros},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003598602&doi=10.1109%2fMCG.2025.3548554&partnerID=40&md5=b709a0c8cf47cc55a52cea73eb9ef15d},
doi = {10.1109/MCG.2025.3548554},
issn = {0272-1716},
year = {2025},
date = {2025-01-01},
journal = {IEEE Computer Graphics and Applications},
volume = {45},
number = {1},
pages = {43–55},
abstract = {Extended reality (XR) is evolving rapidly, offering new paradigms for human-computer interaction. This position paper argues that integrating large language models (LLMs) with XR systems represents a fundamental shift toward more intelligent, context-aware, and adaptive mixed-reality experiences. We propose a structured framework built on three key pillars: first, perception and situational awareness, second, knowledge modeling and reasoning, and third, visualization and interaction. We believe leveraging LLMs within XR environments enables enhanced situational awareness, real-time knowledge retrieval, and dynamic user interaction, surpassing traditional XR capabilities. We highlight the potential of this integration in neurorehabilitation, safety training, and architectural design while underscoring ethical considerations, such as privacy, transparency, and inclusivity. This vision aims to spark discussion and drive research toward more intelligent, human-centric XR systems. © 2025 IEEE.},
keywords = {adult, Article, Augmented and virtual realities, Augmented Reality, Awareness, Context-Aware, human, Information Retrieval, Knowledge model, Knowledge reasoning, Knowledge retrieval, Language Model, Large language model, Mixed reality, neurorehabilitation, Position papers, privacy, Real-time, Reasoning, Situational awareness, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
Zhou, J.; Weber, R.; Wen, E.; Lottridge, D.
Real-Time Full-body Interaction with AI Dance Models: Responsiveness to Contemporary Dance Proceedings Article
In: Int Conf Intell User Interfaces Proc IUI, pp. 1177–1187, Association for Computing Machinery, 2025, ISBN: 979-840071306-4.
Abstract | Links | BibTeX | Tags: 3D modeling, Chatbots, Computer interaction, Deep learning, Deep-Learning Dance Model, Design of Human-Computer Interaction, Digital elevation model, Generative AI, Input output programs, Input sequence, Interactivity, Motion capture, Motion tracking, Movement analysis, Output sequences, Problem oriented languages, Real-time, Text mining, Three dimensional computer graphics, User input, Virtual environments, Virtual Reality
@inproceedings{zhou_real-time_2025,
title = {Real-Time Full-body Interaction with AI Dance Models: Responsiveness to Contemporary Dance},
author = {J. Zhou and R. Weber and E. Wen and D. Lottridge},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001922427&doi=10.1145%2f3708359.3712077&partnerID=40&md5=cea9213198220480b80b7a4840d26ccc},
doi = {10.1145/3708359.3712077},
isbn = {979-840071306-4},
year = {2025},
date = {2025-01-01},
booktitle = {Int Conf Intell User Interfaces Proc IUI},
pages = {1177–1187},
publisher = {Association for Computing Machinery},
abstract = {Interactive AI chatbots put the power of Large Language Models (LLMs) into people's hands; it is this interactivity that fueled their explosive worldwide influence. In the generative dance space, however, there are few deep-learning-based generative dance models built with interactivity in mind. The release of the AIST++ dance dataset in 2021 led to an uptick of capabilities in generative dance models. Whether these models could be adapted to support interactivity and how well this approach will work is not known. In this study, we explore the capabilities of existing generative dance models for motion-to-motion synthesis on real-time, full-body motion-captured contemporary dance data. We identify an existing model that we adapted to support interactivity: the Bailando++ model, which is trained on the AIST++ dataset and was modified to take music and a motion sequence as input parameters in an interactive loop. We worked with two professional contemporary choreographers and dancers to record and curate a diverse set of 203 motion-captured dance sequences as a set of "user inputs" captured through the OptiTrack high-precision motion capture 3D tracking system. We extracted 17 quantitative movement features from the motion data using the well-established Laban Movement Analysis theory, which allowed for quantitative comparisons of inter-movement correlations that we used for clustering input data and comparing input and output sequences. A total of 10 pieces of music were used to generate a variety of outputs using the adapted Bailando++ model. We found that, on average, the generated output motion achieved only moderate correlations to the user input, with some exceptions of movement and music pairs achieving high correlation. The high-correlation generated output sequences were deemed responsive and relevant co-creations in relation to the input sequences. We discuss implications for interactive generative dance agents, where 3D joint coordinate data should be preferred over SMPL parameters for ease of real-time generation, and how Laban Movement Analysis could be used to extract useful features and fine-tune deep-learning models. © 2025 Copyright held by the owner/author(s).},
keywords = {3D modeling, Chatbots, Computer interaction, Deep learning, Deep-Learning Dance Model, Design of Human-Computer Interaction, Digital elevation model, Generative AI, Input output programs, Input sequence, Interactivity, Motion capture, Motion tracking, Movement analysis, Output sequences, Problem oriented languages, Real-time, Text mining, Three dimensional computer graphics, User input, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
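The correlation analysis in the abstract above compares Laban-derived feature time series from the dancer's input with those of the generated output. A minimal sketch of that comparison (our illustration; the 17-feature extractor is a HYPOTHETICAL stand-in for the paper's Laban Movement Analysis step):

import numpy as np

def laban_features(motion: np.ndarray) -> np.ndarray:
    """HYPOTHETICAL: (frames, joints, 3) joint positions -> (frames, 17) features."""
    ...

def feature_correlations(input_motion: np.ndarray, output_motion: np.ndarray) -> np.ndarray:
    fi = laban_features(input_motion)   # (T, 17)
    fo = laban_features(output_motion)  # (T, 17), time-aligned with the input
    # Pearson correlation per movement feature
    return np.array([np.corrcoef(fi[:, k], fo[:, k])[0, 1] for k in range(fi.shape[1])])

Averaging the returned vector, as in feature_correlations(user_seq, generated_seq).mean(), gives the kind of overall input/output correlation the study reports as "moderate".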
2024
Xi, M.; Perera, M.; Matthews, B.; Wang, R.; Weiley, V.; Somarathna, R.; Maqbool, H.; Chen, J.; Engelke, U.; Anderson, S.; Adcock, M.; Thomas, B. H.
Towards Immersive AI Proceedings Article
In: Eck, U.; Sra, M.; Stefanucci, J.; Sugimoto, M.; Tatzgern, M.; Williams, I. (Ed.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 260–264, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833150691-9.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Augmented Reality, Data visualization, Decision making, Heterogeneous data, Immersive, Immersive analytic, Immersive analytics, Industrial research, Mixed reality, Neuro-symbolic system, Real-time, Scientific paradigm, Situated imaging, Time-interleaved, Visual analytics, Workflows
@inproceedings{xi_towards_2024,
title = {Towards Immersive AI},
author = {M. Xi and M. Perera and B. Matthews and R. Wang and V. Weiley and R. Somarathna and H. Maqbool and J. Chen and U. Engelke and S. Anderson and M. Adcock and B. H. Thomas},
editor = {Eck U. and Sra M. and Stefanucci J. and Sugimoto M. and Tatzgern M. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85214375967&doi=10.1109%2fISMAR-Adjunct64951.2024.00062&partnerID=40&md5=fd07c97119d71418bb4365582b1d188c},
doi = {10.1109/ISMAR-Adjunct64951.2024.00062},
isbn = {979-833150691-9},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {260–264},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {With every shift in scientific paradigms comes not only a new way of seeing the world but, as Kuhn argues, new tools for seeing [13]. Today, generative AI and neuro-symbolic systems show signs of changing how science functions, making it possible to synthesise complex heterogeneous data in real time, interleaved with complex and situated workflows. But the new tools are not yet fully formed. Realising the opportunities and meeting the challenges posed by the growth of generative AI for science and other knowledge work requires us to look beyond improvements in algorithms. The decision-making landscape for information workers has drastically changed, and the pressing need for analysts and experts to collaborate with AI in complex, high-tempo data environments has never been more evident. To bring strategic focus to these challenges in ways that will enable social, environmental and economic benefits for all, CSIRO's Data61 (the data and digital specialist arm of the Commonwealth Scientific and Industrial Research Organisation, Australia's national science agency) has established the Immersive AI Research Cluster. The cluster allows more than 30 research scientists and engineers to focus on a broad range of scientific disciplines that help people work with and understand the information provided by AI, such as data visualisation, visual analytics, and connecting remote people, through immersive technologies like virtual and augmented reality. This workshop paper presents the trending research directions and challenges that emerged from this research cluster, which are closely linked to the scientific domains and illustrated through use cases. © 2024 IEEE.},
keywords = {Artificial intelligence, Augmented Reality, Data visualization, Decision making, Heterogeneous data, Immersive, Immersive analytic, Immersive analytics, Industrial research, Mixed reality, Neuro-symbolic system, Real-time, Scientific paradigm, Situated imaging, Time-interleaved, Visual analytics, Workflows},
pubstate = {published},
tppubtype = {inproceedings}
}
Ansari, U.; Qureshi, H. A.; Soomro, N. A.; Memon, A. R.
Augmented Reality-Driven Reservoir Management Via Generative AI: Transforming Pore-Scale Fluid Flow Simulation Proceedings Article
In: Soc. Pet. Eng. - ADIPEC, Society of Petroleum Engineers, 2024, ISBN: 978-195902549-8.
Abstract | Links | BibTeX | Tags: AI techniques, Augmented Reality, Decision making, Decisions makings, Efficiency, Finance, Fluid flow simulation, Fluid-flow, Gasoline, High-accuracy, Management tasks, Petroleum refining, Petroleum reservoir evaluation, Pore scale, Real-time, User interaction
@inproceedings{ansari_augmented_2024,
title = {Augmented Reality-Driven Reservoir Management Via Generative AI: Transforming Pore-Scale Fluid Flow Simulation},
author = {U. Ansari and H. A. Qureshi and N. A. Soomro and A. R. Memon},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85215124881&doi=10.2118%2f222865-MS&partnerID=40&md5=32e8ddc777be342df8196b86a4eb7c60},
doi = {10.2118/222865-MS},
isbn = {978-195902549-8},
year = {2024},
date = {2024-01-01},
booktitle = {Soc. Pet. Eng. - ADIPEC},
publisher = {Society of Petroleum Engineers},
abstract = {The current revolution of generative artificial intelligence is transforming global dynamics, and it is also essential for petroleum engineers to complete technical tasks effectively. Hence, the main aim of this study is to investigate the application of generative AI techniques for improving the efficiency of petroleum reservoir management. The outcomes of this study will help in developing and implementing generative AI algorithms tailored for reservoir management tasks, including reservoir modeling, production optimization, and decision support. In this study a generative AI technique is employed and integrated with augmented reality (AR) to enhance reservoir management. The methodology involves developing a generative AI model to simulate pore-scale fluid flow, validated against experimental data. AR is utilized to visualize and interact with the simulation results in a real-time, immersive environment. The integration process includes data preprocessing, model training, and AR deployment. Performance metrics such as accuracy, computational efficiency, and user interaction quality are evaluated to assess the effectiveness of the proposed approach in transforming traditional reservoir management practices. The developed generative AI model demonstrated high accuracy in simulating pore-scale fluid flow, closely matching experimental data with a correlation coefficient of 0.95. The AR interface provided an intuitive visualization, significantly improving user comprehension and decision-making efficiency. Computational efficiency was enhanced by 40% compared to traditional methods, enabling real-time simulations and interactions. Moreover, users found the AR-driven approach more engaging and easier to understand, with a reported 30% increase in correct decision-making in reservoir management tasks. The integration of generative AI with AR allowed for dynamic adjustments and immediate feedback, which was particularly beneficial in complex scenarios requiring rapid analysis and response. In conclusion, the combination of generative AI and AR offers a transformative approach to reservoir management, enhancing both the accuracy of simulations and the effectiveness of user interactions. This methodology not only improves computational efficiency but also fosters better decision-making through immersive visualization. Future work will focus on refining the AI model and expanding the AR functionalities to cover a broader range of reservoir conditions and management strategies. This study introduces a novel integration of generative AI and augmented reality (AR) for reservoir management, offering a pioneering approach to pore-scale fluid flow simulation. By combining high-accuracy AI-driven simulations with real-time, immersive AR visualizations, this methodology significantly enhances user interaction and decision-making efficiency. This innovative framework transforms traditional practices, providing a more engaging, efficient, and accurate tool for managing complex reservoir systems. Copyright 2024, Society of Petroleum Engineers.},
keywords = {AI techniques, Augmented Reality, Decision making, Decisions makings, Efficiency, Finance, Fluid flow simulation, Fluid-flow, Gasoline, High-accuracy, Management tasks, Petroleum refining, Petroleum reservoir evaluation, Pore scale, Real-time, User interaction},
pubstate = {published},
tppubtype = {inproceedings}
}
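The validation figure quoted in the abstract (a correlation coefficient of 0.95 against experimental data) corresponds to an ordinary Pearson correlation between simulated and measured flow quantities. A short reference sketch, not the authors' code:

import numpy as np

def validate(simulated: np.ndarray, experimental: np.ndarray) -> float:
    """Pearson r between simulated and experimentally measured pore-scale flow values."""
    return float(np.corrcoef(simulated, experimental)[0, 1])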
Weerasinghe, K.; Janapati, S.; Ge, X.; Kim, S.; Iyer, S.; Stankovic, J. A.; Alemzadeh, H.
Real-Time Multimodal Cognitive Assistant for Emergency Medical Services Proceedings Article
In: Proc. - ACM/IEEE Conf. Internet-of-Things Des. Implement., IoTDI, pp. 85–96, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037025-6.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Augmented Reality, Cognitive Assistance, Computational Linguistics, Decision making, Domain knowledge, Edge computing, Emergency medical services, Forecasting, Graphic methods, Language Model, machine learning, Machine-learning, Multi-modal, Real-time, Service protocols, Smart Health, Speech recognition, State of the art
@inproceedings{weerasinghe_real-time_2024,
title = {Real-Time Multimodal Cognitive Assistant for Emergency Medical Services},
author = {K. Weerasinghe and S. Janapati and X. Ge and S. Kim and S. Iyer and J. A. Stankovic and H. Alemzadeh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85197769304&doi=10.1109%2fIoTDI61053.2024.00012&partnerID=40&md5=a3b7cf14e46ecb2d4e49905fb845f2c9},
doi = {10.1109/IoTDI61053.2024.00012},
isbn = {979-835037025-6},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - ACM/IEEE Conf. Internet-of-Things Des. Implement., IoTDI},
pages = {85–96},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Emergency Medical Services (EMS) responders often operate under time-sensitive conditions, facing cognitive overload and inherent risks, requiring essential skills in critical thinking and rapid decision-making. This paper presents CognitiveEMS, an end-to-end wearable cognitive assistant system that can act as a collaborative virtual partner engaging in the real-time acquisition and analysis of multimodal data from an emergency scene and interacting with EMS responders through Augmented Reality (AR) smart glasses. CognitiveEMS processes the continuous streams of data in real-time and leverages edge computing to provide assistance in EMS protocol selection and intervention recognition. We address key technical challenges in real-time cognitive assistance by introducing three novel components: (i) a Speech Recognition model that is fine-tuned for real-world medical emergency conversations using simulated EMS audio recordings, augmented with synthetic data generated by large language models (LLMs); (ii) an EMS Protocol Prediction model that combines state-of-the-art (SOTA) tiny language models with EMS domain knowledge using graph-based attention mechanisms; (iii) an EMS Action Recognition module which leverages multimodal audio and video data and protocol predictions to infer the intervention/treatment actions taken by the responders at the incident scene. Our results show that for speech recognition we achieve superior performance compared to SOTA (WER of 0.290 vs. 0.618) on conversational data. Our protocol prediction component also significantly outperforms SOTA (top-3 accuracy of 0.800 vs. 0.200) and the action recognition achieves an accuracy of 0.727, while maintaining an end-to-end latency of 3.78s for protocol prediction on the edge and 0.31s on the server. © 2024 IEEE.},
keywords = {Artificial intelligence, Augmented Reality, Cognitive Assistance, Computational Linguistics, Decision making, Domain knowledge, Edge computing, Emergency medical services, Forecasting, Graphic methods, Language Model, machine learning, Machine-learning, Multi-modal, Real-time, Service protocols, Smart Health, Speech recognition, State of the art},
pubstate = {published},
tppubtype = {inproceedings}
}
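For reference, the protocol-prediction result quoted above (top-3 accuracy of 0.800 vs. 0.200) uses the standard top-k metric: a case counts as correct if the true protocol appears among the model's k highest-ranked candidates. A small sketch of the metric itself (ours, not the paper's code; the example protocols are invented):

def top_k_accuracy(ranked_predictions: list[list[str]],
                   ground_truth: list[str], k: int = 3) -> float:
    hits = sum(truth in preds[:k]
               for preds, truth in zip(ranked_predictions, ground_truth))
    return hits / len(ground_truth)

# e.g. top_k_accuracy([["cardiac arrest", "stroke", "seizure"]], ["stroke"]) -> 1.0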
Hart, A.; Shakir, M. Z.
Realtime AI Driven Environment Development for Virtual Metaverse Proceedings Article
In: IEEE Int. Conf. Metrol. Ext. Real., Artif. Intell. Neural Eng., MetroXRAINE - Proc., pp. 313–318, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037800-9.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D models, 3d-modeling, AI in Metaverse Development, Artificial intelligence in metaverse development, Digital elevation model, Digital Innovation, Digital innovations, Metaverses, Real-time, Real-Time Adaptation, Scalable virtual world, Scalable Virtual Worlds, Unity Integration, Virtual environments, Virtual worlds
@inproceedings{hart_realtime_2024,
title = {Realtime AI Driven Environment Development for Virtual Metaverse},
author = {A. Hart and M. Z. Shakir},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85216090810&doi=10.1109%2fMetroXRAINE62247.2024.10796022&partnerID=40&md5=e339d3117291e480231b7bc32f117506},
doi = {10.1109/MetroXRAINE62247.2024.10796022},
isbn = {979-835037800-9},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Int. Conf. Metrol. Ext. Real., Artif. Intell. Neural Eng., MetroXRAINE - Proc.},
pages = {313–318},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {The integration of Artificial Intelligence (AI) into the development of Metaverse environments denotes a noteworthy shift towards crafting virtual spaces with improved interactivity, immersion, and realism. This study delves into the various roles AI plays in using 3D models to enrich experiences in virtual and augmented reality and to create scalable, dynamic virtual environments. It carefully examines the challenges related to computational demands, such as processing power and data storage, scalability issues, and ethical considerations concerning privacy and the misuse of AI-generated content. By exploring AI's application in game engine platforms such as Unity through ongoing research, this paper highlights the technical achievements and ever-growing possibilities unlocked by AI, such as creating lifelike virtual environments. © 2024 IEEE.},
keywords = {3D modeling, 3D models, 3d-modeling, AI in Metaverse Development, Artificial intelligence in metaverse development, Digital elevation model, Digital Innovation, Digital innovations, Metaverses, Real-time, Real-Time Adaptation, Scalable virtual world, Scalable Virtual Worlds, Unity Integration, Virtual environments, Virtual worlds},
pubstate = {published},
tppubtype = {inproceedings}
}
De La Torre, F.; Fang, C. M.; Huang, H.; Banburski-Fahey, A.; Fernandez, J. A.; Lanier, J.
LLMR: Real-time Prompting of Interactive Worlds using Large Language Models Proceedings Article
In: Conf Hum Fact Comput Syst Proc, Association for Computing Machinery, 2024, ISBN: 979-840070330-0.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Computational Linguistics, Design goal, Interactive computer graphics, Interactive worlds, Internal dynamics, Language Model, Large language model, Mixed reality, Novel strategies, Real-time, Spatial Reasoning, Training data
@inproceedings{de_la_torre_llmr_2024,
title = {LLMR: Real-time Prompting of Interactive Worlds using Large Language Models},
author = {F. De La Torre and C. M. Fang and H. Huang and A. Banburski-Fahey and J. A. Fernandez and J. Lanier},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85194848276&doi=10.1145%2f3613904.3642579&partnerID=40&md5=14969e96507a1f0110262021e5b1172d},
doi = {10.1145/3613904.3642579},
isbn = {979-840070330-0},
year = {2024},
date = {2024-01-01},
booktitle = {Conf Hum Fact Comput Syst Proc},
publisher = {Association for Computing Machinery},
abstract = {We present Large Language Model for Mixed Reality (LLMR), a framework for the real-time creation and modification of interactive Mixed Reality experiences using LLMs. LLMR leverages novel strategies to tackle difficult cases where ideal training data is scarce, or where the design goal requires the synthesis of internal dynamics, intuitive analysis, or advanced interactivity. Our framework relies on text interaction and the Unity game engine. By incorporating techniques for scene understanding, task planning, self-debugging, and memory management, LLMR outperforms the standard GPT-4 by 4x in average error rate. We demonstrate LLMR's cross-platform interoperability with several example worlds, and evaluate it on a variety of creation and modification tasks to show that it can produce and edit diverse objects, tools, and scenes. Finally, we conducted a usability study (N=11) with a diverse set of participants that revealed positive experiences with the system and willingness to use it again. © 2024 Copyright held by the owner/author(s).},
keywords = {Artificial intelligence, Computational Linguistics, Design goal, Interactive computer graphics, Interactive worlds, Internal dynamics, Language Model, Large language model, Mixed reality, Novel strategies, Real-time, Spatial Reasoning, Training data},
pubstate = {published},
tppubtype = {inproceedings}
}
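The self-debugging behavior the LLMR abstract mentions can be pictured as a generate/compile/retry loop around the Unity-facing code generator. The sketch below is our hedged reading of that idea; both calls are HYPOTHETICAL placeholders, not LLMR's published API:

def llm(prompt: str) -> str:
    """HYPOTHETICAL: returns Unity C# code for the requested scene change."""
    ...

def compile_errors(csharp_code: str) -> list[str]:
    """HYPOTHETICAL: compile in Unity and return error messages (empty if clean)."""
    ...

def create_interactively(request: str, scene_summary: str, max_retries: int = 3) -> str:
    prompt = f"Scene: {scene_summary}\nTask: {request}\nWrite Unity C# code."
    code = llm(prompt)
    for _ in range(max_retries):  # self-debugging loop
        errors = compile_errors(code)
        if not errors:
            return code
        code = llm(prompt + "\nFix these compiler errors:\n" + "\n".join(errors))
    raise RuntimeError("could not produce compiling code")

Feeding compiler output back into the prompt is the general pattern; how LLMR combines it with scene understanding and memory management is detailed in the paper itself.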
Amato, N.; De Carolis, B.; Gioia, F.; Venezia, M. N.; Palestra, G.; Loglisci, C.
Can an AI-driven VTuber engage People? The KawAIi Case Study Proceedings Article
In: Soto, A.; Zangerle, E. (Ed.): CEUR Workshop Proc., CEUR-WS, 2024, ISSN: 1613-0073.
Abstract | Links | BibTeX | Tags: 3D Avatars, Case-studies, Conversational Agents, Facial Expressions, Language Model, Live streaming, LLM, LLMs, Real-time, Three dimensional computer graphics, Virtual agent, Virtual Reality, YouTube
@inproceedings{amato_can_2024,
title = {Can an AI-driven VTuber engage People? The KawAIi Case Study},
author = {N. Amato and B. De Carolis and F. Gioia and M. N. Venezia and G. Palestra and C. Loglisci},
editor = {Soto A. and Zangerle E.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190754935&partnerID=40&md5=bd76d56b13e328027aa1b458849cf73f},
issn = {1613-0073},
year = {2024},
date = {2024-01-01},
booktitle = {CEUR Workshop Proc.},
volume = {3660},
publisher = {CEUR-WS},
abstract = {Live streaming has become increasingly popular, with most streamers presenting their real-life appearance. However, Virtual YouTubers (VTubers), virtual 2D or 3D avatars that are voiced by humans, are emerging as live streamers and attracting a growing viewership. This paper presents the development of a conversational agent, named KawAIi, embodied in a 2D character that, while accurately and promptly responding to user requests, provides an entertaining experience on streaming platforms such as YouTube with adequate real-time support. The agent relies on the Vicuna 7B GPTQ 4-bit Large Language Model (LLM). In addition, KawAIi uses a BERT-based model to analyze each sentence generated by the LLM in terms of conveyed emotion and shows self-emotion awareness through facial expressions. Tested with users, the system demonstrated a good ability to handle interaction with the user while maintaining a pleasant user experience. In particular, KawAIi was evaluated positively in terms of engagement and competence on various topics. The results show the potential of this technology to enrich interactivity in streaming platforms and offer a promising model for future online assistance contexts. © 2024 Copyright for this paper by its authors.},
keywords = {3D Avatars, Case-studies, Conversational Agents, Facial Expressions, Language Model, Live streaming, LLM, LLMs, Real-time, Three dimensional computer graphics, Virtual agent, Virtual Reality, YouTube},
pubstate = {published},
tppubtype = {inproceedings}
}
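The abstract above pairs an LLM reply with a BERT-based emotion classifier whose label drives the avatar's facial expression. A minimal sketch of that wiring; both models and the expression table are HYPOTHETICAL stand-ins (the paper names Vicuna 7B and a BERT-based classifier but not these interfaces):

def generate_reply(chat_message: str) -> str:
    """HYPOTHETICAL: Vicuna-7B-style LLM response to a live-chat message."""
    ...

def classify_emotion(sentence: str) -> str:
    """HYPOTHETICAL: BERT-based classifier -> e.g. 'joy', 'surprise', 'neutral'."""
    ...

EXPRESSIONS = {"joy": "smile", "surprise": "wide_eyes", "neutral": "idle"}

def respond(chat_message: str) -> tuple[str, str]:
    reply = generate_reply(chat_message)
    expression = EXPRESSIONS.get(classify_emotion(reply), "idle")
    return reply, expression  # text for speech output, expression for the 2D avatar rig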
Imamura, S.; Hiraki, H.; Rekimoto, J.
Serendipity Wall: A Discussion Support System Using Real-Time Speech Recognition and Large Language Model Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 588–590, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037449-0.
Abstract | Links | BibTeX | Tags: Brainstorming sessions, Discussion support, Embeddings, Group discussions, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Language Model, Large displays, Real-time, Speech recognition, Support systems, Virtual Reality
@inproceedings{imamura_serendipity_2024,
title = {Serendipity Wall: A Discussion Support System Using Real-Time Speech Recognition and Large Language Model},
author = {S. Imamura and H. Hiraki and J. Rekimoto},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85195557406&doi=10.1109%2fVRW62533.2024.00113&partnerID=40&md5=22c393aa1ea99a9e64d382f1b56fb877},
doi = {10.1109/VRW62533.2024.00113},
isbn = {979-835037449-0},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {588–590},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Group discussions are important for exploring new ideas. One method to support discussions is presenting relevant keywords or images; however, such prompts have tended not to take the context of the conversation into account. Therefore, we propose a system that develops group discussions by presenting related information in response to the ongoing conversation. As a specific example, this study addressed academic discussions among HCI researchers. During brainstorming sessions, the system continuously transcribes the dialogue and generates embedding vectors of the discussions. These vectors are matched against those of existing research articles to identify relevant studies. The system then presents the relevant studies on a large display, summarized by an LLM. In a case study, this system had the effect of broadening the topics of discussion and facilitating the acquisition of new knowledge. A larger display area is desirable in terms of information volume and size. Therefore, in addition to large displays, virtual reality environments with headsets could be suitable for this system. © 2024 IEEE.},
keywords = {Brainstorming sessions, Discussion support, Embeddings, Group discussions, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Language Model, Large displays, Real-time, Speech recognition, Support systems, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
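The retrieval step described above (embed the running transcript, match it against article embeddings, summarize the hits with an LLM) is a standard nearest-neighbor search in embedding space. A minimal sketch under that assumption; the embedding model is a HYPOTHETICAL stand-in:

import numpy as np

def embed(text: str) -> np.ndarray:
    """HYPOTHETICAL: text -> unit-normalized embedding vector."""
    ...

def related_papers(transcript_window: str,
                   paper_embeddings: np.ndarray,  # (n_papers, d), rows unit-normalized
                   paper_titles: list[str],
                   top_k: int = 5) -> list[str]:
    q = embed(transcript_window)
    scores = paper_embeddings @ q            # cosine similarity via dot product
    best = np.argsort(scores)[::-1][:top_k]  # highest-scoring papers first
    return [paper_titles[i] for i in best]

The returned titles would then be handed to an LLM for the on-display summaries the paper describes.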
Villalobos, W.; Kumar, Y.; Li, J. J.
The Multilingual Eyes Multimodal Traveler’s App Proceedings Article
In: Yang, X.-S.; Sherratt, S.; Dey, N.; Joshi, A. (Ed.): Lect. Notes Networks Syst., pp. 565–575, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 2367-3370; ISBN: 978-981-97-3304-0.
Abstract | Links | BibTeX | Tags: AI in travel, Artificial intelligence in travel, Assistive navigation technologies, Assistive navigation technology, Assistive navigations, Human-AI interaction in tourism, Human-artificial intelligence interaction in tourism, Language Model, Military applications, Military operations, Multi-modal, Multilingual translations, Multimodal large language model, Multimodal LLMs, Navigation technology, Real-time, Real-time multilingual translation, Robots, Virtual Reality
@inproceedings{villalobos_multilingual_2024,
title = {The Multilingual Eyes Multimodal Traveler’s App},
author = {W. Villalobos and Y. Kumar and J. J. Li},
editor = {Yang X.-S. and Sherratt S. and Dey N. and Joshi A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85201104509&doi=10.1007%2f978-981-97-3305-7_45&partnerID=40&md5=91f94aa091c97ec3ad251e07b47fa06e},
doi = {10.1007/978-981-97-3305-7_45},
issn = {2367-3370},
isbn = {978-981-97-3304-0},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Networks Syst.},
volume = {1004 LNNS},
pages = {565–575},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper presents an in-depth analysis of “The Multilingual Eyes Multimodal Traveler’s App” (MEMTA), a novel application in the realm of travel technology, leveraging advanced Artificial Intelligence (AI) capabilities. The core of MEMTA’s innovation lies in its integration of multimodal Large Language Models (LLMs), notably ChatGPT-4-Vision, to enhance navigational assistance and situational awareness for tourists and visually impaired individuals in diverse environments. The study rigorously evaluates how the incorporation of OpenAI’s Whisper and DALL-E 3 technologies augments the app’s proficiency in real-time, multilingual translation, pronunciation, and visual content generation, thereby significantly improving the user experience in various geographical settings. A key focus is placed on the development and impact of a custom GPT model, Susanin, designed specifically for the app, highlighting its advancements in Human-AI interaction and accessibility over standard LLMs. The paper thoroughly explores the practical applications of MEMTA, extending its utility beyond mere travel assistance to sectors such as robotics, virtual reality, and military operations, thus underscoring its multifaceted significance. Through this exploration, the study contributes novel insights into the fields of AI-enhanced travel, assistive technologies, and the broader scope of human-AI interaction. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2024.},
keywords = {AI in travel, Artificial intelligence in travel, Assistive navigation technologies, Assistive navigation technology, Assistive navigations, Human-AI interaction in tourism, Human-artificial intelligence interaction in tourism, Language Model, Military applications, Military operations, Multi-modal, Multilingual translations, Multimodal large language model, Multimodal LLMs, Navigation technology, Real-time, Real-time multilingual translation, Robots, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Liang, X.; Wang, Y.; Yan, F.; Ouyang, Z.; Hu, Y.; Luo, S.
Reborn of the White Bone Demon: Role-Playing Game Design Using Generative AI in XR Proceedings Article
In: Spencer, S.N. (Ed.): Proc. - SIGGRAPH Asia Posters, SA, Association for Computing Machinery, Inc, 2024, ISBN: 979-840071138-1.
Abstract | Links | BibTeX | Tags: Artificial intelligence techniques, Emotion Recognition, Game design, Gaming experiences, Real-time, Role-playing game, Speech emotion recognition, Speech enhancement, Speech recognition, Time based
@inproceedings{liang_reborn_2024,
title = {Reborn of the White Bone Demon: Role-Playing Game Design Using Generative AI in XR},
author = {X. Liang and Y. Wang and F. Yan and Z. Ouyang and Y. Hu and S. Luo},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85215520655&doi=10.1145%2f3681756.3697949&partnerID=40&md5=a255cbdfd881f70df82341875f16d546},
doi = {10.1145/3681756.3697949},
isbn = {979-840071138-1},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - SIGGRAPH Asia Posters, SA},
publisher = {Association for Computing Machinery, Inc},
abstract = {This paper explores the application of Generative Artificial Intelligence (GenAI) techniques to the design of Role-Playing Games (RPGs) in Extended Reality (XR) environments. We developed the game Reborn of the White Bone Demon, which utilizes AI speech emotion recognition to generate storylines and game assets in real time based on the player's conversations with NPCs, enhancing the player's immersion and personalized experience and demonstrating the potential of GenAI in enhancing the XR gaming experience. © 2024 Copyright held by the owner/author(s).},
keywords = {Artificial intelligence techniques, Emotion Recognition, Game design, Gaming experiences, Real-time, Role-playing game, Speech emotion recognition, Speech enhancement, Speech recognition, Time based},
pubstate = {published},
tppubtype = {inproceedings}
}
Behravan, M.; Gracanin, D.
Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality Proceedings Article
In: Spencer, S.N. (Ed.): Proc. ACM Symp. Virtual Reality Softw. Technol. VRST, Association for Computing Machinery, 2024, ISBN: 979-840070535-9.
Abstract | Links | BibTeX | Tags: 3D object, 3D Object Generation, Augmented Reality, Content creation, Context-Aware, Generative adversarial networks, Generative AI, generative artificial intelligence, Language Model, Multi-modal, Real-time, Time contexts, Vision language model, vision language models, Visual languages
@inproceedings{behravan_generative_2024,
title = {Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality},
author = {M. Behravan and D. Gracanin},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85212524068&doi=10.1145%2f3641825.3689685&partnerID=40&md5=daf8aa8960d9dd4dbdbf67ccb1e7fb83},
doi = {10.1145/3641825.3689685},
isbn = {979-840070535-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. ACM Symp. Virtual Reality Softw. Technol. VRST},
publisher = {Association for Computing Machinery},
abstract = {We introduce a framework that uses generative Artificial Intelligence (AI) for dynamic and context-aware content creation in Augmented Reality (AR). By integrating Vision Language Models (VLMs), our system detects and understands the physical space around the user, recommending contextually relevant objects. These objects are transformed into 3D models using text-to-3D generative AI techniques, allowing for real-time content inclusion within the AR space. This approach enhances user experience by enabling intuitive customization through spoken commands, while reducing costs and improving accessibility to advanced AR interactions. The framework's vision and language capabilities support the generation of comprehensive and context-specific 3D objects. © 2024 Owner/Author.},
keywords = {3D object, 3D Object Generation, Augmented Reality, Content creation, Context-Aware, Generative adversarial networks, Generative AI, generative artificial intelligence, Language Model, Multi-modal, Real-time, Time contexts, Vision language model, vision language models, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
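The context-aware loop in the abstract above chains a vision-language model, a recommender, and a text-to-3D generator. A hedged sketch of that chain; all three calls are HYPOTHETICAL placeholders rather than the paper's actual interfaces:

def describe_scene(camera_frame) -> str:
    """HYPOTHETICAL: VLM caption of the physical space, e.g. 'an empty desk by a window'."""
    ...

def suggest_objects(scene_description: str) -> list[str]:
    """HYPOTHETICAL: recommends contextually relevant objects for that space."""
    ...

def text_to_3d(name: str):
    """HYPOTHETICAL: generates a placeable 3D asset from a short prompt."""
    ...

def populate_ar_scene(camera_frame, pick: int = 0):
    suggestions = suggest_objects(describe_scene(camera_frame))
    return text_to_3d(suggestions[pick])  # asset inserted into the AR view in real time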
Klein, A.; Arnowitz, E.
AI in mixed reality - Copilot on HoloLens: Spatial computing with large language models Proceedings Article
In: Spencer, S.N. (Ed.): Proc. - SIGGRAPH Real-Time Live!, Association for Computing Machinery, Inc, 2024, ISBN: 979-840070526-7.
Abstract | Links | BibTeX | Tags: 3D, AI, AR, Gesture, Gestures, HoloLens, Language Model, LLM, Mixed reality, Real-time, Spatial computing, User experience design, User interfaces, Voice
@inproceedings{klein_ai_2024,
title = {AI in mixed reality - Copilot on HoloLens: Spatial computing with large language models},
author = {A. Klein and E. Arnowitz},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200657459&doi=10.1145%2f3641520.3665305&partnerID=40&md5=07d385771b8813c1fafa0efb7ae7e9f2},
doi = {10.1145/3641520.3665305},
isbn = {979-840070526-7 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - SIGGRAPH Real-Time Live!},
publisher = {Association for Computing Machinery, Inc},
abstract = {Mixed reality together with AI presents a human-first interface that promises to transform operations. Copilot can assist industrial workers in real-time with speech and holograms; generative AI is used to search technical documentation, service records, training content, and other sources. Copilot then summarizes to provide interactive guidance. © 2024 Owner/Author.},
keywords = {3D, AI, AR, Gesture, Gestures, HoloLens, Language Model, LLM, Mixed reality, Real-time, Spatial computing, User experience design, User interfaces, Voice},
pubstate = {published},
tppubtype = {inproceedings}
}