AHCI RESEARCH GROUP
Publications
Papers published in international journals, conference and workshop proceedings, and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTex record for each paper.
2025
Mereu, J.
Using LLMs to enhance end-user development support in XR Proceedings Article
In: Paneva, V.; Tetteroo, D.; Frau, V.; Feger, S.; Spano, D.; Paterno, F.; Sauer, S.; Manca, M. (Ed.): CEUR Workshop Proc., CEUR-WS, 2025, ISSN: 1613-0073.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules
@inproceedings{mereu_using_2025,
title = {Using LLMs to enhance end-user development support in XR},
author = {J. Mereu},
editor = {Paneva V. and Tetteroo D. and Frau V. and Feger S. and Spano D. and Paterno F. and Sauer S. and Manca M.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105008755984&partnerID=40&md5=bfaaa38c3bee309621426f8f35332107},
issn = {1613-0073},
year = {2025},
date = {2025-01-01},
booktitle = {CEUR Workshop Proc.},
volume = {3978},
publisher = {CEUR-WS},
abstract = {This paper outlines the center stage of my PhD research, which aims to empower non-developer users to create and customize eXtended Reality (XR) environments through End-User Development (EUD) techniques combined with the latest AI tools. In particular, I describe my contributions to the EUD4XR project, detailing both the work completed and the ongoing developments. EUD4XR seeks to support end-users in customizing XR content with the assistance of a Large Language Model (LLM)-based conversational agent. © 2025 Copyright for this paper by its authors.},
keywords = {Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules},
pubstate = {published},
tppubtype = {inproceedings}
}
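The EUD4XR work above targets event-condition-action (ECA) rules authored through an LLM-based conversational agent. As a rough sketch of what such a rule can look like once extracted from natural language (the schema and tiny engine below are illustrative assumptions, not the EUD4XR implementation):

# Minimal event-condition-action sketch; an LLM front-end would emit
# structured rules like `rules[0]` from a user's natural-language request.
from dataclasses import dataclass

@dataclass
class Rule:
    event: str       # e.g. "user_grabs"
    condition: str   # object the event must involve
    action: str      # effect to trigger in the XR scene

rules = [Rule(event="user_grabs", condition="key", action="unlock_door")]

def on_event(event: str, obj: str) -> list[str]:
    """Return the action of every rule matching this event/object pair."""
    return [r.action for r in rules if r.event == event and r.condition == obj]

print(on_event("user_grabs", "key"))  # ['unlock_door']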
Kai, W. -H.; Xing, K. -X.
Video-driven musical composition using large language model with memory-augmented state space Journal Article
In: Visual Computer, vol. 41, no. 5, pp. 3345–3357, 2025, ISSN: 0178-2789.
Abstract | Links | BibTeX | Tags: Associative storage, Augmented Reality, Augmented state space, Computer simulation languages, Computer system recovery, Distributed computer systems, HTTP, Language Model, Large language model, Long-term video-to-music generation, Mamba, Memory architecture, Memory-augmented, Modeling languages, Music, Musical composition, Natural language processing systems, Object oriented programming, Performance, Problem oriented languages, State space, State-space
@article{kai_video-driven_2025,
title = {Video-driven musical composition using large language model with memory-augmented state space},
author = {W. -H. Kai and K. -X. Xing},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001073242&doi=10.1007%2fs00371-024-03606-w&partnerID=40&md5=7ea24f13614a9a24caf418c37a10bd8c},
doi = {10.1007/s00371-024-03606-w},
issn = {0178-2789},
year = {2025},
date = {2025-01-01},
journal = {Visual Computer},
volume = {41},
number = {5},
pages = {3345–3357},
abstract = {The current landscape of research leveraging large language models (LLMs) is experiencing a surge. Many works harness the powerful reasoning capabilities of these models to comprehend various modalities, such as text, speech, images, videos, etc. However, the research work on LLMs for music inspiration is still in its infancy. To fill the gap in this field and break through the dilemma that LLMs can only understand short videos with limited frames, we propose a large language model with state space for long-term video-to-music generation. To capture long-range dependencies and maintain high performance while further decreasing the computing cost, our overall network includes the Enhanced Video Mamba, which incorporates continuous moving window partitioning and local feature augmentation, and a long-term memory bank that captures and aggregates historical video information to mitigate information loss in long sequences. This framework achieves both subquadratic-time computation and near-linear memory complexity, enabling effective long-term video-to-music generation. We conduct a thorough evaluation of our proposed framework. The experimental results demonstrate that our model achieves or surpasses the performance of the current state-of-the-art models. Our code is released at https://github.com/kai211233/S2L2-V2M. © The Author(s), under exclusive licence to Springer-Verlag GmbH Germany, part of Springer Nature 2024.},
keywords = {Associative storage, Augmented Reality, Augmented state space, Computer simulation languages, Computer system recovery, Distributed computer systems, HTTP, Language Model, Large language model, Long-term video-to-music generation, Mamba, Memory architecture, Memory-augmented, Modeling languages, Music, Musical composition, Natural language processing systems, Object oriented programming, Performance, Problem oriented languages, State space, State-space},
pubstate = {published},
tppubtype = {article}
}
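The model above pairs a Mamba-style state-space video encoder (with moving-window partitioning) with a long-term memory bank. A toy numpy sketch of those two ingredients, far simpler than the published network and with invented dimensions, conveys the mechanics:

import numpy as np

def ssm_scan(x, A, B, C):
    # Linear state-space recurrence: h_t = A h_{t-1} + B x_t ; y_t = C h_t.
    h, ys = np.zeros(A.shape[0]), []
    for x_t in x:                       # x: (T, d_in) frame features
        h = A @ h + B @ x_t
        ys.append(C @ h)
    return np.asarray(ys)

def encode_long_video(frames, A, B, C, window=16):
    # Moving-window partitioning plus a memory bank of window summaries
    # to mitigate information loss over long sequences.
    bank, outputs = [], []
    for start in range(0, len(frames), window):
        y = ssm_scan(frames[start:start + window], A, B, C)
        bank.append(y.mean(axis=0))                # summarize the window
        outputs.append(y + np.mean(bank, axis=0))  # crude fusion with history
    return np.concatenate(outputs)

rng = np.random.default_rng(0)
A = rng.normal(scale=0.1, size=(16, 16))
B, C = rng.normal(size=(16, 8)), rng.normal(size=(8, 16))
print(encode_long_video(rng.normal(size=(64, 8)), A, B, C).shape)  # (64, 8)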
Huang, D.; Ge, M.; Xiang, K.; Zhang, X.; Yang, H.
Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions Journal Article
In: International Journal of Network Management, vol. 35, 2025, ISSN: 1055-7148.
Abstract | Links | BibTeX | Tags: Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers
@article{huang_privacy_2025,
title = {Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions},
author = {D. Huang and M. Ge and K. Xiang and X. Zhang and H. Yang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199980257&doi=10.1002%2fnem.2292&partnerID=40&md5=2dea1caa1d31aecde3d302a908fb7dd3},
doi = {10.1002/nem.2292},
issn = {1055-7148},
year = {2025},
date = {2025-01-01},
journal = {Int J Network Manage},
volume = {35},
publisher = {John Wiley and Sons Ltd},
abstract = {Large language models (LLMs), with their billions to trillions of parameters, excel in natural language processing, machine translation, dialog systems, and text summarization. These capabilities are increasingly pivotal in the metaverse, where they can enhance virtual interactions and environments. However, their extensive use, particularly in the metaverse's immersive platforms, raises significant privacy concerns. This paper analyzes existing privacy issues in LLMs, vital for both traditional and metaverse applications, and examines protection techniques across the entire life cycle of these models, from training to user deployment. We delve into cryptography, embedding layer encoding, differential privacy and its variants, and adversarial networks, highlighting their relevance in the metaverse context. Specifically, we explore technologies like homomorphic encryption and secure multiparty computation, which are essential for metaverse security. Our discussion on Gaussian differential privacy, Renyi differential privacy, Edgeworth accounting, and the generation of adversarial samples and loss functions emphasizes their importance in the metaverse's dynamic and interactive environments. Lastly, the paper discusses the current research status and future challenges in the security of LLMs within and beyond the metaverse, emphasizing urgent problems and potential areas for exploration. © 2024 John Wiley & Sons Ltd.},
keywords = {Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers},
pubstate = {published},
tppubtype = {article}
}
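Among the techniques surveyed above is Gaussian differential privacy for model training. A minimal numpy sketch of the Gaussian mechanism in a DP-SGD-style step: clip each per-example gradient, then add calibrated noise (the constants are illustrative, not the survey's recommendations):

import numpy as np

def dp_average_gradients(per_example_grads, clip_norm=1.0, noise_multiplier=1.0,
                         rng=np.random.default_rng(0)):
    # Clip each per-example gradient to bound its L2 sensitivity...
    clipped = [g * min(1.0, clip_norm / (np.linalg.norm(g) + 1e-12))
               for g in per_example_grads]
    mean = np.mean(clipped, axis=0)
    # ...then add Gaussian noise scaled to sensitivity / batch size.
    sigma = noise_multiplier * clip_norm / len(per_example_grads)
    return mean + rng.normal(scale=sigma, size=mean.shape)

grads = [np.random.randn(4) for _ in range(32)]
print(dp_average_gradients(grads))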
Guo, P.; Zhang, Q.; Tian, C.; Xue, W.; Feng, X.
Digital Human Techniques for Education Reform Proceedings Article
In: ICETM - Proc. Int. Conf. Educ. Technol. Manag., pp. 173–178, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071746-8 (ISBN).
Abstract | Links | BibTeX | Tags: Augmented Reality, Contrastive Learning, Digital elevation model, Digital human technique, Digital Human Techniques, Digital humans, Education Reform, Education reforms, Educational Technology, Express emotions, Federated learning, Human behaviors, Human form models, Human techniques, Immersive, Innovative technology, Modeling languages, Natural language processing systems, Teachers', Teaching, Virtual environments, Virtual humans
@inproceedings{guo_digital_2025,
title = {Digital Human Techniques for Education Reform},
author = {P. Guo and Q. Zhang and C. Tian and W. Xue and X. Feng},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001671326&doi=10.1145%2f3711403.3711428&partnerID=40&md5=dd96647315af9409d119f68f9cf4e980},
doi = {10.1145/3711403.3711428},
isbn = {979-840071746-8 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {ICETM - Proc. Int. Conf. Educ. Technol. Manag.},
pages = {173–178},
publisher = {Association for Computing Machinery, Inc},
abstract = {The rapid evolution of artificial intelligence, big data, and generative AI models has ushered in significant transformations across various sectors, including education. Digital Human Technique, an innovative technology grounded in advanced computer science and artificial intelligence, is reshaping educational paradigms by enabling virtual humans to simulate human behavior, express emotions, and interact with users. This paper explores the application of Digital Human Technique in education reform, focusing on creating immersive, intelligent classroom experiences that foster meaningful interactions between teachers and students. We define Digital Human Technique and delve into its key technical components such as character modeling and rendering, natural language processing, computer vision, and augmented reality technologies. Our methodology involves analyzing the role of educational digital humans created through these technologies, assessing their impact on educational processes, and examining various application scenarios in educational reform. Results indicate that Digital Human Technique significantly enhances the learning experience by enabling personalized teaching, increasing engagement, and fostering emotional connections. Educational digital humans serve as virtual teachers, interactive learning aids, and facilitators of emotional interaction, effectively addressing the challenges of traditional educational methods. They also promote a deeper understanding of complex concepts through simulated environments and interactive digital content. © 2024 Copyright held by the owner/author(s).},
keywords = {Augmented Reality, Contrastive Learning, Digital elevation model, Digital human technique, Digital Human Techniques, Digital humans, Education Reform, Education reforms, Educational Technology, Express emotions, Federated learning, Human behaviors, Human form models, Human techniques, Immersive, Innovative technology, Modeling languages, Natural language processing systems, Teachers', Teaching, Virtual environments, Virtual humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, J.; Wu, X.; Lan, T.; Li, B.
LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2715–2724, 2025, ISSN: 1077-2626.
Abstract | Links | BibTeX | Tags: % reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality
@article{chen_llmer_2025,
title = {LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models},
author = {J. Chen and X. Wu and T. Lan and B. Li},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003825793&doi=10.1109%2fTVCG.2025.3549549&partnerID=40&md5=da4681d0714548e3a7e0c8c3295d2348},
doi = {10.1109/TVCG.2025.3549549},
issn = {1077-2626},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2715–2724},
abstract = {The integration of Large Language Models (LLMs) like GPT-4 with Extended Reality (XR) technologies offers the potential to build truly immersive XR environments that interact with human users through natural language, e.g., generating and animating 3D scenes from audio inputs. However, the complexity of XR environments makes it difficult to accurately extract relevant contextual data and scene/object parameters from an overwhelming volume of XR artifacts. This leads not only to increased costs with pay-per-use models, but also to elevated levels of generation errors. Moreover, existing approaches focusing on coding script generation are often prone to generation errors, resulting in flawed or invalid scripts, application crashes, and ultimately a degraded user experience. To overcome these challenges, we introduce LLMER, a novel framework that creates interactive XR worlds using JSON data generated by LLMs. Unlike prior approaches focusing on coding script generation, LLMER translates natural language inputs into JSON data, significantly reducing the likelihood of application crashes and processing latency. It employs a multi-stage strategy to supply only the essential contextual information adapted to the user's request and features multiple modules designed for various XR tasks. Our preliminary user study reveals the effectiveness of the proposed system, with over 80% reduction in consumed tokens and around 60% reduction in task completion time compared to state-of-the-art approaches. The analysis of users' feedback also illuminates a series of directions for further optimization. © 1995-2012 IEEE.},
keywords = {% reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
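LLMER's central idea is that the LLM emits structured JSON rather than executable scripts, so malformed output can be rejected and re-requested instead of crashing the XR application. A minimal sketch of that validation step (the field names are invented; the paper's actual JSON format is richer):

import json

SCHEMA = {"object": str, "action": str, "position": list}  # hypothetical fields

def parse_llm_json(raw: str):
    """Validate LLM output against the expected schema; return None on
    failure so the caller can re-prompt instead of crashing the session."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict) or not all(
            isinstance(data.get(k), t) for k, t in SCHEMA.items()):
        return None
    return data

print(parse_llm_json('{"object": "lamp", "action": "spawn", "position": [0, 1, 2]}'))
print(parse_llm_json("spawn a lamp at the origin"))  # None -> safe fallback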
Ding, S.; Chen, Y.
RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 131–136, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6 (ISBN).
Abstract | Links | BibTeX | Tags: Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing
@inproceedings{ding_rag-vr_2025,
title = {RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments},
author = {S. Ding and Y. Chen},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005140593&doi=10.1109%2fVRW66409.2025.00034&partnerID=40&md5=36dc5fef97aeea4d6e183c83ce9fcd89},
doi = {10.1109/VRW66409.2025.00034},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {131–136},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Recent advances in large language models (LLMs) provide new opportunities for context understanding in virtual reality (VR). However, VR contexts are often highly localized and personalized, limiting the effectiveness of general-purpose LLMs. To address this challenge, we present RAG-VR, the first 3D question-answering system for VR that incorporates retrieval-augmented generation (RAG), which augments an LLM with external knowledge retrieved from a localized knowledge database to improve the answer quality. RAG-VR includes a pipeline for extracting comprehensive knowledge about virtual environments and user conditions for accurate answer generation. To ensure efficient retrieval, RAG-VR offloads the retrieval process to a nearby edge server and uses only essential information during retrieval. Moreover, we train the retriever to effectively distinguish among relevant, irrelevant, and hard-to-differentiate information in relation to questions. RAG-VR improves answer accuracy by 17.9%-41.8% and reduces end-to-end latency by 34.5%-47.3% compared with two baseline systems. © 2025 IEEE.},
keywords = {Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing},
pubstate = {published},
tppubtype = {inproceedings}
}
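RAG-VR grounds the LLM in facts retrieved from a localized knowledge base about the virtual environment. A bare-bones retrieval step using term-frequency cosine similarity stands in here for the trained retriever and edge-server index the paper describes:

from collections import Counter
import math

facts = [
    "The red key unlocks the cellar door.",
    "The health potion sits on the table.",
    "The dragon sleeps until the bell rings.",
]

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[t] * b[t] for t in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

def retrieve(question: str, k: int = 2) -> list[str]:
    q = Counter(question.lower().split())
    return sorted(facts, key=lambda f: cosine(q, Counter(f.lower().split())),
                  reverse=True)[:k]

question = "what does the red key unlock?"
print("Answer using only this context:\n" + "\n".join(retrieve(question))
      + "\nQ: " + question)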
2024
Pester, A.; Tammaa, A.; Gütl, C.; Steinmaurer, A.; El-Seoud, S. A.
Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning Proceedings Article
In: IEEE Global Eng. Edu. Conf., EDUCON, IEEE Computer Society, 2024, ISSN: 2165-9559, ISBN: 979-835039402-3 (ISBN).
Abstract | Links | BibTeX | Tags: Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds
@inproceedings{pester_conversational_2024,
title = {Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning},
author = {A. Pester and A. Tammaa and C. Gütl and A. Steinmaurer and S. A. El-Seoud},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199068668&doi=10.1109%2fEDUCON60312.2024.10578895&partnerID=40&md5=1b904fd8a5e06d7ced42a328c028bbb7},
doi = {10.1109/EDUCON60312.2024.10578895},
issn = {2165-9559},
isbn = {979-835039402-3},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Global Eng. Edu. Conf., EDUCON},
publisher = {IEEE Computer Society},
abstract = {Large Language Models represent a significant breakthrough in Natural Language Processing research and opened a wide range of application domains. This paper demonstrates the successful integration of Large Language Models into immersive learning environments. The review highlights how this emerging technology aligns with pedagogical principles, enhancing the effectiveness of current educational systems. It also reflects recent advancements in integrating Large Language Models, including fine-tuning, hallucination reduction, fact-checking, and human evaluation of generated results. © 2024 IEEE.},
keywords = {Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds},
pubstate = {published},
tppubtype = {inproceedings}
}
Clocchiatti, A.; Fumero, N.; Soccini, A. M.
Character Animation Pipeline based on Latent Diffusion and Large Language Models Proceedings Article
In: Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR, pp. 398–405, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037202-1 (ISBN).
Abstract | Links | BibTeX | Tags: Animation, Animation pipeline, Artificial intelligence, Augmented Reality, Character animation, Computational Linguistics, Computer animation, Deep learning, Diffusion, E-Learning, Extended reality, Film production, Generative art, Language Model, Learning systems, Learning techniques, Natural language processing systems, Pipelines, Production pipelines, Virtual Reality
@inproceedings{clocchiatti_character_2024,
title = {Character Animation Pipeline based on Latent Diffusion and Large Language Models},
author = {A. Clocchiatti and N. Fumero and A. M. Soccini},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85187217072&doi=10.1109%2fAIxVR59861.2024.00067&partnerID=40&md5=d88b9ba7c80d49b60fd0d7acd5e7c4f0},
doi = {10.1109/AIxVR59861.2024.00067},
isbn = {979-835037202-1 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR},
pages = {398–405},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Artificial intelligence and deep learning techniques are revolutionizing the film production pipeline. The majority of the current screenplay-to-animation pipelines focus on understanding the screenplay through natural language processing techniques, and on the generation of the animation through custom engines, missing the possibility to customize the characters. To address these issues, we propose a high-level pipeline for generating 2D characters and animations starting from screenplays, through a combination of Latent Diffusion Models and Large Language Models. Our approach uses ChatGPT to generate character descriptions starting from the screenplay. Then, using that data, it generates images of custom characters with Stable Diffusion and animates them according to their actions in different scenes. The proposed approach avoids well-known problems in generative AI tools such as temporal inconsistency and lack of control on the outcome. The results suggest that the pipeline is consistent and reliable, benefiting industries ranging from film production to virtual, augmented and extended reality content creation. © 2024 IEEE.},
keywords = {Animation, Animation pipeline, Artificial intelligence, Augmented Reality, Character animation, Computational Linguistics, Computer animation, Deep learning, Diffusion, E-Learning, Extended reality, Film production, Generative art, Language Model, Learning systems, Learning techniques, Natural language processing systems, Pipelines, Production pipelines, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
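The pipeline above chains an LLM (character descriptions from the screenplay) into a latent diffusion model (character images) before animation. A structural sketch with stubs standing in for the ChatGPT and Stable Diffusion calls (the stubs return canned values; this shows only the chaining, not the models):

def describe_characters(screenplay: str) -> dict[str, str]:
    # Stand-in for an LLM call that extracts character descriptions.
    return {"ALICE": "a tall botanist in a yellow raincoat"}

def generate_portrait(description: str) -> str:
    # Stand-in for a latent-diffusion call; returns a rendered image path.
    return "portraits/" + description.replace(" ", "_") + ".png"

def build_characters(screenplay: str) -> list[tuple[str, str]]:
    """Screenplay -> (character, portrait path) pairs, one per character."""
    return [(name, generate_portrait(desc))
            for name, desc in describe_characters(screenplay).items()]

print(build_characters("INT. GREENHOUSE - DAY. ALICE waters a glowing fern."))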
Venkatachalam, N.; Rayana, M.; Bala Vignesh, S.; Prathamesh, S.
Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences Proceedings Article
In: Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT, pp. 1133–1138, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835032753-3 (ISBN).
Abstract | Links | BibTeX | Tags: Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets
@inproceedings{venkatachalam_voice-driven_2024,
title = {Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences},
author = {N. Venkatachalam and M. Rayana and S. Bala Vignesh and S. Prathamesh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190121845&doi=10.1109%2fIDCIoT59759.2024.10467441&partnerID=40&md5=6594fbab013d9156b79a887f0d7209cb},
doi = {10.1109/IDCIoT59759.2024.10467441},
isbn = {979-835032753-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT},
pages = {1133–1138},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This research study introduces an innovative system that aims to synthesize 360-degree panoramic images in real time based on vocal prompts from the user, leveraging state-of-the-art Generative AI with a combination of advanced NLP models. The primary objective of this system is to transform spoken descriptions into immersive and interactive visual scenes, specifically designed to provide users with first-person field views. This cutting-edge technology has the potential to revolutionize the realm of virtual reality (VR) experiences, enabling users to effortlessly create and navigate through personalized environments. The fundamental goal of this system is to enable the generation of real-time images that are seamlessly compatible with VR headsets, offering a truly immersive and adaptive visual experience. Beyond its technological advancements, this research also highlights its significant potential for creating a positive social impact. One notable application lies in psychological interventions, particularly in the context of phobia treatment and therapeutic settings. Here, patients can safely confront and work through their fears within these synthesized environments, potentially offering new avenues for therapy. Furthermore, the system serves educational and entertainment purposes by bringing users' imaginations to life, providing an unparalleled platform for exploring the boundaries of virtual experiences. Overall, this research represents a promising stride towards a more immersive and adaptable future in VR technology, with the potential to enhance various aspects of human lives, from mental health treatment to entertainment and education. © 2024 IEEE.},
keywords = {Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets},
pubstate = {published},
tppubtype = {inproceedings}
}
Yin, Z.; Wang, Y.; Papatheodorou, T.; Hui, P.
Text2VRScene: Exploring the Framework of Automated Text-driven Generation System for VR Experience Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR, pp. 701–711, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037402-5 (ISBN).
Abstract | Links | BibTeX | Tags: Automated systems, Automation, Digital contents, Generation systems, Generative model, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Interaction techniques, Language Model, Natural language processing systems, Text input, User interfaces, Virtual Reality
@inproceedings{yin_text2vrscene_2024,
title = {Text2VRScene: Exploring the Framework of Automated Text-driven Generation System for VR Experience},
author = {Z. Yin and Y. Wang and T. Papatheodorou and P. Hui},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85191431035&doi=10.1109%2fVR58804.2024.00090&partnerID=40&md5=5484a5bc3939d003efe68308f56b15a6},
doi = {10.1109/VR58804.2024.00090},
isbn = {979-835037402-5 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR},
pages = {701–711},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {With the recent development of the Virtual Reality (VR) industry, the increasing number of VR users pushes the demand for the massive production of immersive and expressive VR scenes in related industries. However, creating expressive VR scenes involves the reasonable organization of various digital content to express a coherent and logical theme, which is time-consuming and labor-intensive. In recent years, Large Language Models (LLMs) such as ChatGPT 3.5 and generative models such as stable diffusion have emerged as powerful tools for comprehending natural language and generating digital contents such as text, code, images, and 3D objects. In this paper, we have explored how we can generate VR scenes from text by incorporating LLMs and various generative models into an automated system. To achieve this, we first identify the possible limitations of LLMs for an automated system and propose a systematic framework to mitigate them. Subsequently, we developed Text2VRScene, a VR scene generation system, based on our proposed framework with well-designed prompts. To validate the effectiveness of our proposed framework and the designed prompts, we carry out a series of test cases. The results show that the proposed framework contributes to improving the reliability of the system and the quality of the generated VR scenes. The results also illustrate the promising performance of the Text2VRScene in generating satisfying VR scenes with a clear theme regularized by our well-designed prompts. This paper ends with a discussion about the limitations of the current system and the potential of developing similar generation systems based on our framework. © 2024 IEEE.},
keywords = {Automated systems, Automation, Digital contents, Generation systems, Generative model, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Interaction techniques, Language Model, Natural language processing systems, Text input, User interfaces, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
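Text2VRScene's framework hardens the generation loop against typical LLM failure modes through staged, well-designed prompts. One piece such a framework implies is a retry-with-validation loop; a sketch with invented prompts and a toy validator:

def generate_scene(prompt: str, llm, validate, max_tries: int = 3) -> str:
    """Call the LLM, validate the output, and re-prompt with the error
    appended until the result parses or the retry budget runs out."""
    for _ in range(max_tries):
        output = llm(prompt)
        ok, error = validate(output)
        if ok:
            return output
        prompt += f"\nYour previous answer was invalid ({error}). Try again."
    raise RuntimeError("LLM failed validation after retries")

import json
calls = {"n": 0}
def fake_llm(p):  # toy stand-in: succeeds only on the second attempt
    calls["n"] += 1
    return '{"skybox": "sunset"}' if calls["n"] > 1 else "a sunset skybox"
def is_json(s):
    try:
        json.loads(s); return True, ""
    except ValueError as e:
        return False, str(e)

print(generate_scene("Describe the VR scene as JSON.", fake_llm, is_json))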
Krauss, C.; Bassbouss, L.; Upravitelev, M.; An, T. -S.; Altun, D.; Reray, L.; Balitzki, E.; El Tamimi, T.; Karagülle, M.
Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse Proceedings Article
In: Sottilare, R.A.; Schwarz, J. (Ed.): Lect. Notes Comput. Sci., pp. 219–238, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 0302-9743, ISBN: 978-303160608-3 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality
@inproceedings{krauss_opportunities_2024,
title = {Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse},
author = {C. Krauss and L. Bassbouss and M. Upravitelev and T. -S. An and D. Altun and L. Reray and E. Balitzki and T. El Tamimi and M. Karagülle},
editor = {Sottilare R.A. and Schwarz J.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196214138&doi=10.1007%2f978-3-031-60609-0_16&partnerID=40&md5=9a66876cb30e9e5d287a86e6cfa66e05},
doi = {10.1007/978-3-031-60609-0_16},
issn = {0302-9743},
isbn = {978-303160608-3},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14727 LNCS},
pages = {219–238},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {The paper explores the opportunities and challenges for metaverse learning environments with AI-Assistants based on Large Language Models. A proof of concept based on popular but proprietary technologies is presented that enables a natural language exchange between the user and an AI-based medical expert in a highly immersive environment based on the Unreal Engine. The answers generated by ChatGPT are not only played back lip-synchronously, but also visualized in the VR environment using a 3D model of a skeleton. Usability and user experience play a particularly important role in the development of the highly immersive AI-Assistant. The proof of concept serves to illustrate the opportunities and challenges that lie in the merging of large language models, metaverse applications and educational ecosystems, which are self-contained research areas. Development strategies, tools and interoperability standards will be presented to facilitate future developments in this triangle of tension. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Jeong, E.; Kim, H.; Park, S.; Yoon, S.; Ahn, J.; Woo, W.
Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts Proceedings Article
In: Eck, U.; Sra, M.; Stefanucci, J.; Sugimoto, M.; Tatzgern, M.; Williams, I. (Ed.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 205–208, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833150691-9 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics
@inproceedings{jeong_function-adaptive_2024,
title = {Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts},
author = {E. Jeong and H. Kim and S. Park and S. Yoon and J. Ahn and W. Woo},
editor = {Eck U. and Sra M. and Stefanucci J. and Sugimoto M. and Tatzgern M. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85214379963&doi=10.1109%2fISMAR-Adjunct64951.2024.00050&partnerID=40&md5=7222e0599a7e2aa0adaea38e4b9e13cc},
doi = {10.1109/ISMAR-Adjunct64951.2024.00050},
isbn = {979-833150691-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {205–208},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {We propose an algorithm that extracts the most suitable affordances, interaction targets, and corresponding coordinates adaptively from 3D models of various artifacts based on their functional context for efficient authoring of XR content with artifacts. Traditionally, authoring AR scenes to convey artifact context required one-to-one manual work. Our approach leverages a Large Language Model (LLM) to extract interaction types, positions, and subjects based on the artifact's name and usage context. This enables templated XR experience creation, replacing repetitive manual labor. Consequently, our system streamlines the XR authoring process, making it more efficient and scalable. © 2024 IEEE.},
keywords = {3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics},
pubstate = {published},
tppubtype = {inproceedings}
}
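The system above queries an LLM for affordances suited to an artifact's name and usage context, then anchors them on the 3D model. A sketch of that extraction contract (the JSON shape and the stubbed reply are assumptions, not the authors' prompt format):

import json
from dataclasses import dataclass

@dataclass
class Affordance:
    interaction: str   # e.g. "pour"
    target: str        # sub-part of the artifact
    position: tuple    # anchor on the 3D model, in object space

def extract_affordances(name: str, context: str, llm) -> list[Affordance]:
    reply = llm(f"List affordances of '{name}' used for {context} as JSON.")
    return [Affordance(a["interaction"], a["target"], tuple(a["position"]))
            for a in json.loads(reply)]

fake_llm = lambda _prompt: (
    '[{"interaction": "pour", "target": "spout", "position": [0.1, 0.3, 0.0]}]')
print(extract_affordances("teapot", "serving tea", fake_llm))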
Cronin, I.
Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications Book
Apress Media LLC, 2024, ISBN: 979-886880282-9 (ISBN); 979-886880281-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting
@book{cronin_understanding_2024,
title = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
author = {I. Cronin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001777571&doi=10.1007%2f979-8-8688-0282-9&partnerID=40&md5=c0714ff3e1ad755596426ea092b830d6},
doi = {10.1007/979-8-8688-0282-9},
isbn = {979-886880282-9 (ISBN); 979-886880281-2 (ISBN)},
year = {2024},
date = {2024-01-01},
publisher = {Apress Media LLC},
series = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
abstract = {This guide covers the fundamental technical principles and various business applications of Generative AI for planning, developing, and evaluating AI-driven products. It equips you with the knowledge you need to harness the potential of Generative AI for enhancing business creativity and productivity. The book is organized into three sections: text-based, senses-based, and rationale-based. Each section provides an in-depth exploration of the specific methods and applications of Generative AI. In the text-based section, you will find detailed discussions on designing algorithms to automate and enhance written communication, including insights into the technical aspects of transformer-based Natural Language Processing (NLP) and chatbot architecture, such as GPT-4, Claude 2, Google Bard, and others. The senses-based section offers a glimpse into the algorithms and data structures that underpin visual, auditory, and multisensory experiences, including NeRF, 3D Gaussian Splatting, Stable Diffusion, AR and VR technologies, and more. The rationale-based section illuminates the decision-making capabilities of AI, with a focus on machine learning and data analytics techniques that empower applications such as simulation models, agents, and autonomous systems. In summary, this book serves as a guide for those seeking to navigate the dynamic landscape of Generative AI. Whether you’re a seasoned AI professional or a business leader looking to harness the power of creative automation, these pages offer a roadmap to leverage Generative AI for your organization’s success. © 2024 by Irena Cronin.},
keywords = {Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting},
pubstate = {published},
tppubtype = {book}
}
Kapadia, N.; Gokhale, S.; Nepomuceno, A.; Cheng, W.; Bothwell, S.; Mathews, M.; Shallat, J. S.; Schultz, C.; Gupta, A.
Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator Proceedings Article
In: Chen, J.Y.C.; Fragomeni, G. (Ed.): Lect. Notes Comput. Sci., pp. 200–212, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 0302-9743, ISBN: 978-303161040-0 (ISBN).
Abstract | Links | BibTeX | Tags: Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality
@inproceedings{kapadia_evaluation_2024,
title = {Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator},
author = {N. Kapadia and S. Gokhale and A. Nepomuceno and W. Cheng and S. Bothwell and M. Mathews and J. S. Shallat and C. Schultz and A. Gupta},
editor = {Chen J.Y.C. and Fragomeni G.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196200653&doi=10.1007%2f978-3-031-61041-7_13&partnerID=40&md5=8890a8d0c289fdf6e7ab82e105249097},
doi = {10.1007/978-3-031-61041-7_13},
issn = {0302-9743},
isbn = {978-303161040-0},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14706 LNCS},
pages = {200–212},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper explores the efficacy of Large Language Models (LLMs) in generating dialogues for patient avatars in Virtual Reality (VR) nurse training simulators. With the integration of technology in healthcare education evolving rapidly, the potential of NLP to enhance nurse training through realistic patient interactions presents a significant opportunity. Our study introduces a novel LLM-based dialogue generation system, leveraging models such as ChatGPT, GoogleBard, and ClaudeAI. We detail the development of our script generation system, which was a collaborative endeavor involving nurses, technical artists, and developers. The system, tested on the Meta Quest 2 VR headset, integrates complex dialogues created through a synthesis of clinical expertise and advanced NLP, aimed at simulating real-world nursing scenarios. Through a comprehensive evaluation involving lexical and semantic similarity tests compared to clinical expert-generated scripts, we assess the potential of LLMs as suitable alternatives for script generation. The findings aim to contribute to the development of a more interactive and effective VR nurse training simulator, enhancing communication skills among nursing students for improved patient care outcomes. This research underscores the importance of advanced NLP applications in healthcare education, offering insights into the practicality and limitations of employing LLMs in clinical training environments. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
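The evaluation above compares LLM-generated dialogue with clinical expert scripts via lexical and semantic similarity tests. A minimal sketch of both axes, using edit-based similarity plus a bag-of-words cosine as a crude stand-in for the embedding-based semantic measures a real evaluation would use:

from difflib import SequenceMatcher
from collections import Counter
import math

def lexical_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

def semantic_similarity(a: str, b: str) -> float:
    # Crude proxy: cosine over word counts (real work uses sentence embeddings).
    va, vb = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(va[w] * vb[w] for w in va)
    norm = (math.sqrt(sum(v * v for v in va.values()))
            * math.sqrt(sum(v * v for v in vb.values())))
    return dot / norm if norm else 0.0

expert = "On a scale of zero to ten, how bad is your pain right now?"
generated = "How severe is your pain right now, from zero to ten?"
print(lexical_similarity(expert, generated), semantic_similarity(expert, generated))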
Geurts, E.; Warson, D.; Rovelo Ruiz, G.
Boosting Motivation in Sports with Data-Driven Visualizations in VR Proceedings Article
In: ACM Int. Conf. Proc. Ser., Association for Computing Machinery, 2024, ISBN: 979-840071764-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Asynchronoi social interaction, Asynchronous social interaction, Cycling, Data driven, Dynamics, Extended reality, Group dynamics, Language Model, Large language model, large language models, Motivation, Natural language processing systems, Real-world, Real-world data, Social interactions, Sports, User interface, User interfaces, Virtual Reality, Visualization, Visualizations
@inproceedings{geurts_boosting_2024,
title = {Boosting Motivation in Sports with Data-Driven Visualizations in VR},
author = {E. Geurts and D. Warson and G. Rovelo Ruiz},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85195387493&doi=10.1145%2f3656650.3656669&partnerID=40&md5=ec69e7abe61e572a94261ad6bbfed11c},
doi = {10.1145/3656650.3656669},
isbn = {979-840071764-2 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
publisher = {Association for Computing Machinery},
abstract = {In recent years, the integration of Artificial Intelligence (AI) has sparked revolutionary progress across diverse domains, with sports applications being no exception. At the same time, using real-world data sources, such as GPS, weather, and traffic data, offers opportunities to improve the overall user engagement and effectiveness of such applications. Despite the substantial advancements, including proven success in mobile applications, there remains an untapped potential in leveraging these technologies to boost motivation and enhance social group dynamics in Virtual Reality (VR) sports solutions. Our innovative approach focuses on harnessing the power of AI and real-world data to facilitate the design of such VR systems. To validate our methodology, we conducted an exploratory study involving 18 participants, evaluating our approach within the context of indoor VR cycling. By incorporating GPX files and omnidirectional video (real-world data), we recreated a lifelike cycling environment in which users can compete with simulated cyclists navigating a chosen (real-world) route. Considering the user's performance and interactions with other cyclists, our system employs AI-driven natural language processing tools to generate encouraging and competitive messages automatically. The outcome of our study reveals a positive impact on motivation, competition dynamics, and the perceived sense of group dynamics when using real performance data alongside automatically generated motivational messages. This underscores the potential of AI-driven enhancements in user interfaces to not only optimize performance but also foster a more engaging and supportive sports environment. © 2024 ACM.},
keywords = {Artificial intelligence, Asynchronoi social interaction, Asynchronous social interaction, Cycling, Data driven, Dynamics, Extended reality, Group dynamics, Language Model, Large language model, large language models, Motivation, Natural language processing systems, Real-world, Real-world data, Social interactions, Sports, User interface, User interfaces, Virtual Reality, Visualization, Visualizations},
pubstate = {published},
tppubtype = {inproceedings}
}
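The study above rebuilds real routes from GPX files and turns performance data into motivational messages. A compact sketch of that data path: parse track points, estimate speed, pick a message (the fixed sampling interval and the message strings are illustrative; the paper generates messages with AI-driven NLP tools):

import math
import xml.etree.ElementTree as ET

GPX = """<gpx><trk><trkseg>
<trkpt lat="50.930" lon="5.340"></trkpt>
<trkpt lat="50.931" lon="5.342"></trkpt>
</trkseg></trk></gpx>"""

def haversine_m(p, q):
    R = 6371000.0
    (la1, lo1), (la2, lo2) = map(math.radians, p), map(math.radians, q)
    a = (math.sin((la2 - la1) / 2) ** 2
         + math.cos(la1) * math.cos(la2) * math.sin((lo2 - lo1) / 2) ** 2)
    return 2 * R * math.asin(math.sqrt(a))

pts = [(float(e.get("lat")), float(e.get("lon")))
       for e in ET.fromstring(GPX).iter("trkpt")]
dist = sum(haversine_m(pts[i], pts[i + 1]) for i in range(len(pts) - 1))
speed_kmh = dist / 10.0 * 3.6   # assume 10 s between samples (illustrative)
print("Keep pushing!" if speed_kmh < 25 else "Great pace - stay with the group!")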
Tang, Y.; Situ, J.; Huang, Y.
Beyond User Experience: Technical and Contextual Metrics for Large Language Models in Extended Reality Proceedings Article
In: UbiComp Companion - Companion ACM Int. Jt. Conf. Pervasive Ubiquitous Comput., pp. 640–643, Association for Computing Machinery, Inc, 2024, ISBN: 979-840071058-2 (ISBN).
Abstract | Links | BibTeX | Tags: Augmented Reality, Computer simulation languages, Evaluation Metrics, Extended reality, Language Model, Large language model, large language models, Mixed reality, Modeling performance, Natural language processing systems, Physical world, Spatial computing, spatial data, user experience, Users' experiences, Virtual environments, Virtual Reality
@inproceedings{tang_beyond_2024,
title = {Beyond User Experience: Technical and Contextual Metrics for Large Language Models in Extended Reality},
author = {Y. Tang and J. Situ and Y. Huang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85206203437&doi=10.1145%2f3675094.3678995&partnerID=40&md5=3fb337872b483a163bfbea038f1baffe},
doi = {10.1145/3675094.3678995},
isbn = {979-840071058-2 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {UbiComp Companion - Companion ACM Int. Jt. Conf. Pervasive Ubiquitous Comput.},
pages = {640–643},
publisher = {Association for Computing Machinery, Inc},
abstract = {Spatial Computing involves interacting with the physical world through spatial data manipulation, closely linked with Extended Reality (XR), which includes Virtual Reality (VR), Augmented Reality (AR), and Mixed Reality (MR). Large Language Models (LLMs) significantly enhance XR applications by improving user interactions through natural language understanding and content generation. Typical evaluations of these applications focus on user experience (UX) metrics, such as task performance, user satisfaction, and psychological assessments, but often neglect the technical performance of the LLMs themselves. This paper identifies significant gaps in current evaluation practices for LLMs within XR environments, attributing them to the novelty of the field, the complexity of spatial contexts, and the multimodal nature of interactions in XR. To address these gaps, the paper proposes specific metrics tailored to evaluate LLM performance in XR contexts, including spatial contextual awareness, coherence, proactivity, multimodal integration, hallucination, and question-answering accuracy. These proposed metrics aim to complement existing UX evaluations, providing a comprehensive assessment framework that captures both the technical and user-centric aspects of LLM performance in XR applications. The conclusion underscores the necessity for a dual-focused approach that combines technical and UX metrics to ensure effective and user-friendly LLM-integrated XR systems. © 2024 Copyright held by the owner/author(s).},
keywords = {Augmented Reality, Computer simulation languages, Evaluation Metrics, Extended reality, Language Model, Large language model, large language models, Mixed reality, Modeling performance, Natural language processing systems, Physical world, Spatial computing, spatial data, user experience, Users' experiences, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
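Two of the technical metrics proposed above, question-answering accuracy and hallucination rate, reduce to simple tallies once each response is labeled. A small harness sketch (the record format and grounding labels are invented for illustration):

def score(records):
    """records: [{'answer': str, 'gold': str, 'grounded': bool}, ...]
    Returns (QA accuracy, hallucination rate) over the evaluation set."""
    n = len(records)
    correct = sum(r["answer"].strip().lower() == r["gold"].strip().lower()
                  for r in records)
    hallucinated = sum(not r["grounded"] for r in records)
    return correct / n, hallucinated / n

evalset = [
    {"answer": "on the table", "gold": "on the table", "grounded": True},
    {"answer": "behind you", "gold": "on the shelf", "grounded": False},
]
print(score(evalset))  # (0.5, 0.5)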
2023
Fuchs, A.; Appel, S.; Grimm, P.
Immersive Spaces for Creativity: Smart Working Environments Proceedings Article
In: Yunanto, A.A.; Ramadhani, A.D.; Prayogi, Y.R.; Putra, P.A.M.; Ruswiansari, M.; Ridwan, M.; Gamar, F.; Rahmawati, W.M.; Rusli, M.R.; Humaira, F.M.; Adila, A.F. (Ed.): IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding, pp. 610–617, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835031473-1 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment
@inproceedings{fuchs_immersive_2023,
title = {Immersive Spaces for Creativity: Smart Working Environments},
author = {A. Fuchs and S. Appel and P. Grimm},
editor = {Yunanto A.A. and Ramadhani A.D. and Prayogi Y.R. and Putra P.A.M. and Ruswiansari M. and Ridwan M. and Gamar F. and Rahmawati W.M. and Rusli M.R. and Humaira F.M. and Adila A.F.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85173627291&doi=10.1109%2fIES59143.2023.10242458&partnerID=40&md5=6ab1796f68c29d7747574272314a2e9d},
doi = {10.1109/IES59143.2023.10242458},
isbn = {979-835031473-1 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding},
pages = {610–617},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This paper presents an innovative approach to designing an immersive space that dynamically supports users (inter-)action based on users' behavior, voice, and mood, providing a personalized experience. The objective of this research is to explore how a space can communicate with users in a seamless, engaging, and interactive environment. Therefore, it integrates natural language processing (NLP), generative artificial intelligence applications and human computer interaction that utilizes a combination of sensors, microphones, and cameras to collect real-time data on users' behavior, voice, and mood. This data is then processed and analyzed by an intelligent system that employs machine learning algorithms to identify patterns and adapt the environment accordingly. The adaptive features include changes in lighting, sound, and visual elements to facilitate creativity, focus, relaxation, or socialization, depending on the user's topics and emotional state. The paper discusses the technical aspects of implementing such a system. Additionally, it highlights the potential applications of this technology in various domains such as education, entertainment, and workplace settings. In conclusion, the immersive creative space represents a paradigm shift in human-environment interaction, offering a dynamic and personalized space that caters to the diverse needs of users. The research findings suggest that this innovative approach holds great promise for enhancing user experiences, fostering creativity, and promoting overall well-being. © 2023 IEEE.},
keywords = {Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment},
pubstate = {published},
tppubtype = {inproceedings}
}
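The space above classifies user state from sensed behavior, voice, and mood, then adapts lighting and sound accordingly. A toy sketch of the adaptation step only, as a rule table from a classified mood to environment presets (states and presets are invented; the paper drives this from live sensor streams and learned models):

PRESETS = {
    # mood -> (lighting, soundscape); values invented for illustration
    "focused":  ("cool white, low intensity", "silence"),
    "stressed": ("warm dim", "slow ambient"),
    "social":   ("bright warm", "soft background music"),
}

def adapt(mood: str) -> None:
    lighting, sound = PRESETS.get(mood, PRESETS["focused"])
    print(f"lighting -> {lighting}; sound -> {sound}")

adapt("stressed")  # lighting -> warm dim; sound -> slow ambient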
Joseph, S.; Priya, B. S.; Poorvaja, R.; Santhosh Kumaran, M.; Shivaraj, S.; Jeyanth, V.; Shivesh, R. P.
IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI Proceedings Article
In: Arya, K.V.; Wada, T. (Ed.): Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835030514-2 (ISBN).
Abstract | Links | BibTeX | Tags: 2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones
@inproceedings{joseph_iot_2023,
title = {IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI},
author = {S. Joseph and B. S. Priya and R. Poorvaja and M. Santhosh Kumaran and S. Shivaraj and V. Jeyanth and R. P. Shivesh},
editor = {Arya K.V. and Wada T.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85189754688&doi=10.1109%2fCVMI59935.2023.10465077&partnerID=40&md5=9c1a9d7151c0b04bab83586f515d30aa},
doi = {10.1109/CVMI59935.2023.10465077},
isbn = {979-835030514-2 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {In anticipation of the widespread adoption of augmented reality in the future, this paper introduces an advanced mobile application that seamlessly integrates AR and IoT technologies. The application aims to make these cutting-edge technologies more affordable and accessible to users while highlighting their immense benefits in assisting with household appliance control, as well as providing interactive and educational experiences. The app employs advanced algorithms such as object detection, Natural Language Processing (NLP), and Optical Character Recognition (OCR) to scan the smartphone's camera feed. Upon identification, AR controls for appliances, their power consumption, and electric bill tracking are displayed. Additionally, the application makes use of APIs to access the internet, retrieving relevant 3D generative models, 360-degree videos, 2D images, and textual information based on user interactions with detected objects. Users can effortlessly explore and interact with the 3D generative models using intuitive hand gestures, providing an immersive experience without the need for additional hardware or dedicated VR headsets. Beyond home automation, the app offers valuable educational benefits, serving as a unique learning tool for students to gain hands-on experience. Medical practitioners can quickly reference organ anatomy and utilize its feature-rich functionalities. Its cost-effectiveness, requiring only installation, ensures accessibility to a wide audience. The app's functionality is both intuitive and efficient, detecting objects in the camera feed and prompting user interactions. Users can select objects through simple hand gestures, choosing desired content like 3D generative models, 2D images, textual information, 360-degree videos, or shopping-related details. The app then retrieves and overlays the requested information onto the real-world view in AR. In conclusion, this groundbreaking AR- and IoT-powered app revolutionizes home automation and learning experiences, leveraging only a smartphone's camera, without the need for additional hardware or expensive installations. Its potential applications extend to education, industries, and health care, making it a versatile and valuable tool for a broad range of users. © 2023 IEEE.},
keywords = {2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones},
pubstate = {published},
tppubtype = {inproceedings}
}
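Among the app's overlays are per-appliance power consumption and an electric-bill estimate; the arithmetic behind such an overlay is simple. A sketch with made-up wattages and tariff:

WATTS = {"fan": 70, "refrigerator": 150, "television": 100}  # illustrative ratings
TARIFF = 0.15  # currency units per kWh, illustrative

def monthly_cost(appliance: str, hours_per_day: float) -> float:
    kwh = WATTS[appliance] / 1000 * hours_per_day * 30
    return round(kwh * TARIFF, 2)

print(monthly_cost("refrigerator", 24))  # 150 W * 720 h = 108 kWh -> 16.2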
DeChant, C.; Akinola, I.; Bauer, D.
Learning to summarize and answer questions about a virtual robot’s past actions Journal Article
In: Autonomous Robots, vol. 47, no. 8, pp. 1103–1118, 2023, ISSN: 0929-5593.
Abstract | Links | BibTeX | Tags: Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning
@article{dechant_learning_2023,
title = {Learning to summarize and answer questions about a virtual robot’s past actions},
author = {C. DeChant and I. Akinola and D. Bauer},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85176588341&doi=10.1007%2fs10514-023-10134-4&partnerID=40&md5=162b3343d5f000f2b79f59c339f99022},
doi = {10.1007/s10514-023-10134-4},
issn = {0929-5593},
year = {2023},
date = {2023-01-01},
journal = {Autonomous Robots},
volume = {47},
number = {8},
pages = {1103–1118},
abstract = {When robots perform long action sequences, users will want to easily and reliably find out what they have done. We therefore demonstrate the task of learning to summarize and answer questions about a robot agent’s past actions using natural language alone. A single system with a large language model at its core is trained to both summarize and answer questions about action sequences given ego-centric video frames of a virtual robot and a question prompt. To enable training of question answering, we develop a method to automatically generate English-language questions and answers about objects, actions, and the temporal order in which actions occurred during episodes of robot action in the virtual environment. Training one model to both summarize and answer questions enables zero-shot transfer of representations of objects learned through question answering to improved action summarization. © 2023, The Author(s).},
keywords = {Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning},
pubstate = {published},
tppubtype = {article}
}
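The method above auto-generates English questions and answers about objects, actions, and temporal order in episodes of robot action. A minimal sketch of templated QA generation over an action log (the templates are simplified stand-ins for the authors' generator):

def make_qa(episode):
    """episode: ordered (action, object) pairs from one robot run."""
    qa = []
    for i, (action, obj) in enumerate(episode):
        qa.append((f"What did the robot {action}?", obj))
        qa.append((f"What was the {ordinal(i + 1)} action?", f"{action} {obj}"))
    qa.append(("Which object was involved first?", episode[0][1]))
    return qa

def ordinal(n):
    return {1: "first", 2: "second", 3: "third"}.get(n, f"{n}th")

episode = [("pick up", "red block"), ("place", "blue tray")]
for q, a in make_qa(episode):
    print(q, "->", a)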