AHCI RESEARCH GROUP
Publications
Papers published in international journals, conference and workshop proceedings, and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTeX record for each paper.
2025
Mereu, J.
Using LLMs to enhance end-user development support in XR Proceedings Article
In: Paneva, V.; Tetteroo, D.; Frau, V.; Feger, S.; Spano, D.; Paterno, F.; Sauer, S.; Manca, M. (Eds.): CEUR Workshop Proc., CEUR-WS, 2025, ISSN: 1613-0073.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules
@inproceedings{mereu_using_2025,
title = {Using LLMs to enhance end-user development support in XR},
author = {J. Mereu},
editor = {Paneva V. and Tetteroo D. and Frau V. and Feger S. and Spano D. and Paterno F. and Sauer S. and Manca M.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105008755984&partnerID=40&md5=bfaaa38c3bee309621426f8f35332107},
issn = {1613-0073},
year = {2025},
date = {2025-01-01},
booktitle = {CEUR Workshop Proc.},
volume = {3978},
publisher = {CEUR-WS},
abstract = {This paper outlines the center stage of my PhD research, which aims to empower non-developer users to create and customize eXtended Reality (XR) environments through End-User Development (EUD) techniques combined with the latest AI tools. In particular, I describe my contributions to the EUD4XR project, detailing both the work completed and the ongoing developments. EUD4XR seeks to support end-users in customizing XR content with the assistance of a Large Language Model (LLM)-based conversational agent. © 2025 Copyright for this paper by its authors.},
keywords = {Artificial intelligence, Condition, Configuration, Development support, Development technique, End-User Development, End-Users, Event-condition-action, Event-Condition-Actions, Extended reality, Human computer interaction, Information Systems, Information use, Natural Language, Natural language processing systems, Natural languages, Rule, rules},
pubstate = {published},
tppubtype = {inproceedings}
}
Huang, D.; Ge, M.; Xiang, K.; Zhang, X.; Yang, H.
Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions Journal Article
In: International Journal of Network Management, vol. 35, John Wiley and Sons Ltd, 2025, ISSN: 1055-7148.
Abstract | Links | BibTeX | Tags: Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers
@article{huang_privacy_2025,
title = {Privacy Preservation of Large Language Models in the Metaverse Era: Research Frontiers, Categorical Comparisons, and Future Directions},
author = {D. Huang and M. Ge and K. Xiang and X. Zhang and H. Yang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199980257&doi=10.1002%2fnem.2292&partnerID=40&md5=2dea1caa1d31aecde3d302a908fb7dd3},
doi = {10.1002/nem.2292},
issn = {1055-7148},
year = {2025},
date = {2025-01-01},
journal = {International Journal of Network Management},
volume = {35},
publisher = {John Wiley and Sons Ltd},
abstract = {Large language models (LLMs), with their billions to trillions of parameters, excel in natural language processing, machine translation, dialog systems, and text summarization. These capabilities are increasingly pivotal in the metaverse, where they can enhance virtual interactions and environments. However, their extensive use, particularly in the metaverse's immersive platforms, raises significant privacy concerns. This paper analyzes existing privacy issues in LLMs, vital for both traditional and metaverse applications, and examines protection techniques across the entire life cycle of these models, from training to user deployment. We delve into cryptography, embedding layer encoding, differential privacy and its variants, and adversarial networks, highlighting their relevance in the metaverse context. Specifically, we explore technologies like homomorphic encryption and secure multiparty computation, which are essential for metaverse security. Our discussion on Gaussian differential privacy, Renyi differential privacy, Edgeworth accounting, and the generation of adversarial samples and loss functions emphasizes their importance in the metaverse's dynamic and interactive environments. Lastly, the paper discusses the current research status and future challenges in the security of LLMs within and beyond the metaverse, emphasizing urgent problems and potential areas for exploration. © 2024 John Wiley & Sons Ltd.},
keywords = {Adversarial networks, Computational Linguistics, Cryptography, Differential privacies, Excel, Language Model, Large language model, large language models, Life cycle, Metaverse, Metaverses, Natural language processing systems, Natural languages, Privacy preservation, Privacy protection, Research frontiers},
pubstate = {published},
tppubtype = {article}
}
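To make the survey's core privacy primitive concrete, here is a minimal runnable sketch of the Gaussian mechanism it discusses (our illustration, assuming NumPy; not code from the paper). The noise scale follows the classic calibration sigma = sensitivity * sqrt(2 ln(1.25/delta)) / epsilon, which holds for epsilon < 1.

import numpy as np

def gaussian_mechanism(query_result, sensitivity, epsilon, delta, rng=None):
    # Add noise calibrated so the released value satisfies (epsilon, delta)-DP
    # for a query with bounded L2 sensitivity (assumes epsilon < 1).
    rng = rng or np.random.default_rng()
    sigma = sensitivity * np.sqrt(2.0 * np.log(1.25 / delta)) / epsilon
    return query_result + rng.normal(0.0, sigma, size=np.shape(query_result))

# Example: privatize an average user embedding before it leaves the device.
private_vec = gaussian_mechanism(np.full(8, 0.3), sensitivity=1.0, epsilon=1.0, delta=1e-5)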
Chen, J.; Wu, X.; Lan, T.; Li, B.
LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2715–2724, 2025, ISSN: 1077-2626.
Abstract | Links | BibTeX | Tags: % reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality
@article{chen_llmer_2025,
title = {LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models},
author = {J. Chen and X. Wu and T. Lan and B. Li},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003825793&doi=10.1109%2fTVCG.2025.3549549&partnerID=40&md5=da4681d0714548e3a7e0c8c3295d2348},
doi = {10.1109/TVCG.2025.3549549},
issn = {1077-2626},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2715–2724},
abstract = {The integration of Large Language Models (LLMs) like GPT-4 with Extended Reality (XR) technologies offers the potential to build truly immersive XR environments that interact with human users through natural language, e.g., generating and animating 3D scenes from audio inputs. However, the complexity of XR environments makes it difficult to accurately extract relevant contextual data and scene/object parameters from an overwhelming volume of XR artifacts. It leads to not only increased costs with pay-per-use models, but also elevated levels of generation errors. Moreover, existing approaches focusing on coding script generation are often prone to generation errors, resulting in flawed or invalid scripts, application crashes, and ultimately a degraded user experience. To overcome these challenges, we introduce LLMER, a novel framework that creates interactive XR worlds using JSON data generated by LLMs. Unlike prior approaches focusing on coding script generation, LLMER translates natural language inputs into JSON data, significantly reducing the likelihood of application crashes and processing latency. It employs a multi-stage strategy to supply only the essential contextual information adapted to the user's request and features multiple modules designed for various XR tasks. Our preliminary user study reveals the effectiveness of the proposed system, with over 80% reduction in consumed tokens and around 60% reduction in task completion time compared to state-of-the-art approaches. The analysis of users' feedback also illuminates a series of directions for further optimization. © 1995-2012 IEEE.},
keywords = {% reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
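The JSON-mediated pattern described in the abstract is easy to illustrate. Below is a minimal sketch (ours, not the LLMER implementation; call_llm and xr_runtime are hypothetical stand-ins): structured output is validated before it reaches the XR runtime, so a malformed generation fails safely instead of crashing the application.

import json

SCHEMA_HINT = ('Respond with JSON only: {"actions": [{"op": "spawn|move|animate", '
               '"object": "<name>", "position": [x, y, z]}]}')
ALLOWED_OPS = {"spawn", "move", "animate"}

def parse_scene_commands(llm_output):
    # json.loads raises on malformed output, so broken generations are
    # rejected here rather than executed as a script.
    actions = json.loads(llm_output)["actions"]
    for a in actions:
        if a["op"] not in ALLOWED_OPS:
            raise ValueError(f"unknown op: {a['op']}")
        x, y, z = a["position"]  # must unpack as a 3-vector
    return actions

# actions = parse_scene_commands(call_llm(SCHEMA_HINT, "put a lamp by the sofa"))
# for a in actions:
#     xr_runtime.apply(a)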
Ding, S.; Chen, Y.
RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 131–136, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6 (ISBN).
Abstract | Links | BibTeX | Tags: Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing
@inproceedings{ding_rag-vr_2025,
title = {RAG-VR: Leveraging Retrieval-Augmented Generation for 3D Question Answering in VR Environments},
author = {S. Ding and Y. Chen},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005140593&doi=10.1109%2fVRW66409.2025.00034&partnerID=40&md5=36dc5fef97aeea4d6e183c83ce9fcd89},
doi = {10.1109/VRW66409.2025.00034},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {131–136},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Recent advances in large language models (LLMs) provide new opportunities for context understanding in virtual reality (VR). However, VR contexts are often highly localized and personalized, limiting the effectiveness of general-purpose LLMs. To address this challenge, we present RAG-VR, the first 3D question-answering system for VR that incorporates retrieval-augmented generation (RAG), which augments an LLM with external knowledge retrieved from a localized knowledge database to improve the answer quality. RAG-VR includes a pipeline for extracting comprehensive knowledge about virtual environments and user conditions for accurate answer generation. To ensure efficient retrieval, RAG-VR offloads the retrieval process to a nearby edge server and uses only essential information during retrieval. Moreover, we train the retriever to effectively distinguish among relevant, irrelevant, and hard-to-differentiate information in relation to questions. RAG-VR improves answer accuracy by 17.9%-41.8% and reduces end-to-end latency by 34.5%-47.3% compared with two baseline systems. © 2025 IEEE.},
keywords = {Ambient intelligence, Computational Linguistics, Computer interaction, Computing methodologies, Computing methodologies-Artificial intelligence-Natural language processing-Natural language generation, Computing methodology-artificial intelligence-natural language processing-natural language generation, Data handling, Formal languages, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Interaction paradigm, Interaction paradigms, Language Model, Language processing, Natural language generation, Natural language processing systems, Natural languages, Virtual Reality, Word processing},
pubstate = {published},
tppubtype = {inproceedings}
}
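The retrieval step described above reduces to nearest-neighbor search over an embedded knowledge base. A self-contained toy sketch (our illustration; RAG-VR trains a real retriever and offloads it to an edge server, while this uses a throwaway hashing embedding just to make the flow runnable):

import numpy as np

def embed(text, dim=256):
    # Toy hashing bag-of-words embedding; a real system would use a trained encoder.
    v = np.zeros(dim)
    for tok in text.lower().split():
        v[hash(tok) % dim] += 1.0
    n = np.linalg.norm(v)
    return v / n if n else v

def retrieve(question, kb, k=3):
    # Rank knowledge-base facts by cosine similarity to the question.
    q = embed(question)
    sims = [float(embed(fact) @ q) for fact in kb]
    return [kb[i] for i in np.argsort(sims)[::-1][:k]]

kb = ["The red key unlocks the cabin door.",
      "The player is standing near the lakeside dock.",
      "The lantern can be lit with the matches in the drawer."]
context = retrieve("How do I open the cabin?", kb, k=1)
# The retrieved facts are then prepended to the LLM prompt as grounding context.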
2024
Shrestha, A.; Imamoto, K.
Generative AI based industrial metaverse creation methodology Proceedings Article
In: Proc. - Artif. Intell. Bus., AIxB, pp. 53–57, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835039103-9 (ISBN).
Abstract | Links | BibTeX | Tags: Generative adversarial networks, Generative AI, Industrial metaverse, Industrial railroads, Investments, Maintenance and operation, Metaverses, Natural languages, Railroad transportation, Railway, Railway maintenance, Railway operations, Simple++, simulation
@inproceedings{shrestha_generative_2024,
title = {Generative AI based industrial metaverse creation methodology},
author = {A. Shrestha and K. Imamoto},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85215066217&doi=10.1109%2fAIxB62249.2024.00017&partnerID=40&md5=d6d11729f16ccaa9f69fd5452befe492},
doi = {10.1109/AIxB62249.2024.00017},
isbn = {979-835039103-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - Artif. Intell. Bus., AIxB},
pages = {53–57},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {The metaverse has been proposed as a suitable apparatus for the dissemination of information in a railway maintenance and operation context. However, the generation of such a metaverse environment requires significant investment, with the creation of simple prototypes taking an extended duration. Although there are generative artificial intelligence-based methods to create small scenes, there is an absence of a method to do so for industrial applications. We devised a platform to create railway environments with the assistance of language models for code creation and semantic inference, without the need for reprogramming or editing of the project source, meaning environments can be generated by end users. With natural language input and a coding-paradigm output, the code generation module is shown together with example environments from real-life railway lines in Tokyo, Japan as preliminary results. By creating such environments and leveraging rapid generation with the help of generative artificial intelligence, we show that generative artificial intelligence can be used to automate the task of the programmer to create new environments on demand from the user in natural language. © 2024 IEEE.},
keywords = {Generative adversarial networks, Generative AI, Industrial metaverse, Industrial railroads, Investments, Maintenance and operation, Metaverses, Natural languages, Railroad transportation, Railway, Railway maintenance, Railway operations, Simple++, simulation},
pubstate = {published},
tppubtype = {inproceedings}
}
Venkatachalam, N.; Rayana, M.; Vignesh, S. Bala; Prathamesh, S.
Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences Proceedings Article
In: Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT, pp. 1133–1138, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835032753-3 (ISBN).
Abstract | Links | BibTeX | Tags: Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets
@inproceedings{venkatachalam_voice-driven_2024,
title = {Voice-Driven Panoramic Imagery: Real-Time Generative AI for Immersive Experiences},
author = {N. Venkatachalam and M. Rayana and S. Bala Vignesh and S. Prathamesh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190121845&doi=10.1109%2fIDCIoT59759.2024.10467441&partnerID=40&md5=6594fbab013d9156b79a887f0d7209cb},
doi = {10.1109/IDCIoT59759.2024.10467441},
isbn = {979-835032753-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. Intell. Data Commun. Technol. Internet Things, IDCIoT},
pages = {1133–1138},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This research study introduces an innovative system that aims to synthesize 360-degree panoramic images in real time based on vocal prompts from the user, leveraging state-of-the-art Generative AI with a combination of advanced NLP models. The primary objective of this system is to transform spoken descriptions into immersive and interactive visual scenes, specifically designed to provide users with first-person field views. This cutting-edge technology has the potential to revolutionize the realm of virtual reality (VR) experiences, enabling users to effortlessly create and navigate through personalized environments. The fundamental goal of this system is to enable the generation of real-time images that are seamlessly compatible with VR headsets, offering a truly immersive and adaptive visual experience. Beyond its technological advancements, this research also highlights its significant potential for creating a positive social impact. One notable application lies in psychological interventions, particularly in the context of phobia treatment and therapeutic settings. Here, patients can safely confront and work through their fears within these synthesized environments, potentially offering new avenues for therapy. Furthermore, the system serves educational and entertainment purposes by bringing users' imaginations to life, providing an unparalleled platform for exploring the boundaries of virtual experiences. Overall, this research represents a promising stride towards a more immersive and adaptable future in VR technology, with the potential to enhance various aspects of human lives, from mental health treatment to entertainment and education. © 2024 IEEE.},
keywords = {Adaptive Visual Experience, First person, First-Person view, generative artificial intelligence, Generative Artificial Intelligence (AI), Image processing, Immersive, Immersive visual scene, Immersive Visual Scenes, Language processing, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Panoramic Images, Patient treatment, Personalized environment, Personalized Environments, Phobia Treatment, Prompt, prompts, Psychological intervention, Psychological Interventions, Real-Time Synthesis, User interaction, User interfaces, Virtual experience, Virtual Experiences, Virtual Reality, Virtual Reality (VR), Virtual-reality headsets, Visual experiences, Visual languages, Visual scene, Voice command, Voice commands, VR Headsets},
pubstate = {published},
tppubtype = {inproceedings}
}
Jeong, E.; Kim, H.; Park, S.; Yoon, S.; Ahn, J.; Woo, W.
Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts Proceedings Article
In: Eck, U.; Sra, M.; Stefanucci, J.; Sugimoto, M.; Tatzgern, M.; Williams, I. (Eds.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 205–208, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833150691-9 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics
@inproceedings{jeong_function-adaptive_2024,
title = {Function-Adaptive Affordance Extraction from 3D Objects Using LLM for Interaction Authoring with Augmented Artifacts},
author = {E. Jeong and H. Kim and S. Park and S. Yoon and J. Ahn and W. Woo},
editor = {Eck U. and Sra M. and Stefanucci J. and Sugimoto M. and Tatzgern M. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85214379963&doi=10.1109%2fISMAR-Adjunct64951.2024.00050&partnerID=40&md5=7222e0599a7e2aa0adaea38e4b9e13cc},
doi = {10.1109/ISMAR-Adjunct64951.2024.00050},
isbn = {979-833150691-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {205–208},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {We propose an algorithm that extracts the most suitable affordances, interaction targets, and corresponding coordinates adaptively from 3D models of various artifacts based on their functional context for efficient authoring of XR content with artifacts. Traditionally, authoring AR scenes to convey artifact context required one-to-one manual work. Our approach leverages a Large Language Model (LLM) to extract interaction types, positions, and subjects based on the artifact's name and usage context. This enables templated XR experience creation, replacing repetitive manual labor. Consequently, our system streamlines the XR authoring process, making it more efficient and scalable. © 2024 IEEE.},
keywords = {3D modeling, Applied computing, Art and humanity, Artificial intelligence, Arts and humanities, Augmented Reality, Computer interaction, Computer vision, Computing methodologies, computing methodology, Human computer interaction, Human computer interaction (HCI), Human-centered computing, Humanities computing, Interaction paradigm, Interaction paradigms, Language processing, Mixed / augmented reality, Mixed reality, Modeling languages, Natural Language Processing, Natural language processing systems, Natural languages, Three dimensional computer graphics},
pubstate = {published},
tppubtype = {inproceedings}
}
Cronin, I.
Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications Book
Apress Media LLC, 2024, ISBN: 979-886880282-9 (ISBN); 979-886880281-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting
@book{cronin_understanding_2024,
title = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
author = {I. Cronin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001777571&doi=10.1007%2f979-8-8688-0282-9&partnerID=40&md5=c0714ff3e1ad755596426ea092b830d6},
doi = {10.1007/979-8-8688-0282-9},
isbn = {979-886880282-9 (ISBN); 979-886880281-2 (ISBN)},
year = {2024},
date = {2024-01-01},
publisher = {Apress Media LLC},
series = {Understanding Generative AI Business Applications: A Guide to Technical Principles and Real-World Applications},
abstract = {This guide covers the fundamental technical principles and various business applications of Generative AI for planning, developing, and evaluating AI-driven products. It equips you with the knowledge you need to harness the potential of Generative AI for enhancing business creativity and productivity. The book is organized into three sections: text-based, senses-based, and rationale-based. Each section provides an in-depth exploration of the specific methods and applications of Generative AI. In the text-based section, you will find detailed discussions on designing algorithms to automate and enhance written communication, including insights into the technical aspects of transformer-based Natural Language Processing (NLP) and chatbot architecture, such as GPT-4, Claude 2, Google Bard, and others. The senses-based section offers a glimpse into the algorithms and data structures that underpin visual, auditory, and multisensory experiences, including NeRF, 3D Gaussian Splatting, Stable Diffusion, AR and VR technologies, and more. The rationale-based section illuminates the decision-making capabilities of AI, with a focus on machine learning and data analytics techniques that empower applications such as simulation models, agents, and autonomous systems. In summary, this book serves as a guide for those seeking to navigate the dynamic landscape of Generative AI. Whether you’re a seasoned AI professional or a business leader looking to harness the power of creative automation, these pages offer a roadmap to leverage Generative AI for your organization’s success. © 2024 by Irena Cronin.},
keywords = {Artificial intelligence, Augmented Reality, Autonomous system, Autonomous systems, Business applications, Computer vision, Decision making, Gaussian Splatting, Gaussians, Generative AI, Language processing, Learning algorithms, Learning systems, machine learning, Machine-learning, Natural Language Processing, Natural Language Processing (NLP), Natural language processing systems, Natural languages, Splatting},
pubstate = {published},
tppubtype = {book}
}
Kapadia, N.; Gokhale, S.; Nepomuceno, A.; Cheng, W.; Bothwell, S.; Mathews, M.; Shallat, J. S.; Schultz, C.; Gupta, A.
Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator Proceedings Article
In: Chen, J.Y.C.; Fragomeni, G. (Eds.): Lect. Notes Comput. Sci., pp. 200–212, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 0302-9743; ISBN: 978-303161040-0.
Abstract | Links | BibTeX | Tags: Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality
@inproceedings{kapadia_evaluation_2024,
title = {Evaluation of Large Language Model Generated Dialogues for an AI Based VR Nurse Training Simulator},
author = {N. Kapadia and S. Gokhale and A. Nepomuceno and W. Cheng and S. Bothwell and M. Mathews and J. S. Shallat and C. Schultz and A. Gupta},
editor = {Chen J.Y.C. and Fragomeni G.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196200653&doi=10.1007%2f978-3-031-61041-7_13&partnerID=40&md5=8890a8d0c289fdf6e7ab82e105249097},
doi = {10.1007/978-3-031-61041-7_13},
issn = {0302-9743},
isbn = {978-303161040-0},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14706 LNCS},
pages = {200–212},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper explores the efficacy of Large Language Models (LLMs) in generating dialogues for patient avatars in Virtual Reality (VR) nurse training simulators. With the integration of technology in healthcare education evolving rapidly, the potential of NLP to enhance nurse training through realistic patient interactions presents a significant opportunity. Our study introduces a novel LLM-based dialogue generation system, leveraging models such as ChatGPT, GoogleBard, and ClaudeAI. We detail the development of our script generation system, which was a collaborative endeavor involving nurses, technical artists, and developers. The system, tested on the Meta Quest 2 VR headset, integrates complex dialogues created through a synthesis of clinical expertise and advanced NLP, aimed at simulating real-world nursing scenarios. Through a comprehensive evaluation involving lexical and semantic similarity tests compared to clinical expert-generated scripts, we assess the potential of LLMs as suitable alternatives for script generation. The findings aim to contribute to the development of a more interactive and effective VR nurse training simulator, enhancing communication skills among nursing students for improved patient care outcomes. This research underscores the importance of advanced NLP applications in healthcare education, offering insights into the practicality and limitations of employing LLMs in clinical training environments. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {Bard, ChatGPT, ClaudeAI, Clinical research, Computational Linguistics, Dialogue Generation, Dialogue generations, Education computing, Extended reality, Health care education, Healthcare Education, Language Model, Language processing, Large language model, large language models, Natural Language Processing, Natural language processing systems, Natural languages, Nurse Training Simulation, Nursing, Patient avatar, Patient Avatars, Semantics, Students, Training simulation, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
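The lexical half of the similarity evaluation mentioned above can be as simple as token-overlap F1; a small sketch follows (ours; the authors' exact metrics and embedding models are not reproduced here, and the semantic half would swap in cosine similarity between sentence embeddings).

from collections import Counter

def token_f1(candidate, reference):
    # Harmonic mean of token-level precision and recall against the expert script.
    c, r = Counter(candidate.lower().split()), Counter(reference.lower().split())
    overlap = sum((c & r).values())
    if overlap == 0:
        return 0.0
    precision, recall = overlap / sum(c.values()), overlap / sum(r.values())
    return 2 * precision * recall / (precision + recall)

print(token_f1("I feel a sharp pain in my chest", "the patient reports sharp chest pain"))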
Manesh, S. A.; Zhang, T.; Onishi, Y.; Hara, K.; Bateman, S.; Li, J.; Tang, A.
How People Prompt Generative AI to Create Interactive VR Scenes Proceedings Article
In: Vallgarda, A.; Jonsson, L.; Fritsch, J.; Alaoui, S.F.; Le Dantec, C.A. (Eds.): Proc. ACM Des. Interact. Syst. Conf., pp. 2319–2340, Association for Computing Machinery, Inc, 2024, ISBN: 979-840070583-0 (ISBN).
Abstract | Links | BibTeX | Tags: Embodied interaction, Embodied knowledge, Embodied prompting, Generative AI, Interactive virtual reality, Multi-modal, Natural languages, Programming agents, Prompting, User interfaces, Virtual Reality, Wizard of Oz
@inproceedings{manesh_how_2024,
title = {How People Prompt Generative AI to Create Interactive VR Scenes},
author = {S. A. Manesh and T. Zhang and Y. Onishi and K. Hara and S. Bateman and J. Li and A. Tang},
editor = {Vallgarda A. and Jonsson L. and Fritsch J. and Alaoui S.F. and Le Dantec C.A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200348302&doi=10.1145%2f3643834.3661547&partnerID=40&md5=11831bb65214fd75905ccdaeb8356cdf},
doi = {10.1145/3643834.3661547},
isbn = {979-840070583-0 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. ACM Des. Interact. Syst. Conf.},
pages = {2319–2340},
publisher = {Association for Computing Machinery, Inc},
abstract = {Generative AI tools can provide people with the ability to create virtual environments and scenes with natural language prompts. Yet, how people will formulate such prompts is unclear—particularly when they inhabit the environment that they are designing. For instance, it is likely that a person might say, “Put a chair here,” while pointing at a location. If such linguistic and embodied features are common to people’s prompts, we need to tune models to accommodate them. In this work, we present a Wizard of Oz elicitation study with 22 participants, where we studied people’s implicit expectations when verbally prompting such programming agents to create interactive VR scenes. Our findings show when people prompted the agent, they had several implicit expectations of these agents: (1) they should have an embodied knowledge of the environment; (2) they should understand embodied prompts by users; (3) they should recall previous states of the scene and the conversation, and that (4) they should have a commonsense understanding of objects in the scene. Further, we found that participants prompted differently when they were prompting in situ (i.e. within the VR environment) versus ex situ (i.e. viewing the VR environment from the outside). To explore how these lessons could be applied, we designed and built Ostaad, a conversational programming agent that allows non-programmers to design interactive VR experiences that they inhabit. Based on these explorations, we outline new opportunities and challenges for conversational programming agents that create VR environments. © 2024 Copyright held by the owner/author(s).},
keywords = {Embodied interaction, Embodied knowledge, Embodied prompting, Generative AI, Interactive virtual reality, Multi-modal, Natural languages, Programming agents, Prompting, User interfaces, Virtual Reality, Wizard of Oz},
pubstate = {published},
tppubtype = {inproceedings}
}
Cuervo-Rosillo, R.; Zarraonandia, T.; Díaz, P.
Using Generative AI to Support Non-Experts in the Creation of Immersive Experiences Proceedings Article
In: ACM Int. Conf. Proc. Ser., Association for Computing Machinery, 2024, ISBN: 979-840071764-2 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, End-Users, generative artificial intelligence, Immersive, immersive experience, Immersive Experiences, Natural languages, Speech commands, User interfaces, Virtual Reality
@inproceedings{cuervo-rosillo_using_2024,
title = {Using Generative AI to Support Non-Experts in the Creation of Immersive Experiences},
author = {R. Cuervo-Rosillo and T. Zarraonandia and P. Díaz},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85195422750&doi=10.1145%2f3656650.3656733&partnerID=40&md5=00d53df1d6b30acc6d281bb86ead73ab},
doi = {10.1145/3656650.3656733},
isbn = {979-840071764-2 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
publisher = {Association for Computing Machinery},
abstract = {This work focuses on exploring the use of Generative Artificial Intelligence to assist end-users in creating immersive experiences. We present a prototype that supports the creation and editing of virtual environments using speech commands expressed in natural language. © 2024 Owner/Author.},
keywords = {Artificial intelligence, End-Users, generative artificial intelligence, Immersive, immersive experience, Immersive Experiences, Natural languages, Speech commands, User interfaces, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Kang, Z.; Liu, Y.; Zheng, J.; Sun, Z.
Revealing the Difficulty in Jailbreak Defense on Language Models for Metaverse Proceedings Article
In: Gong, Q.; He, X. (Eds.): SocialMeta - Proc. Int. Workshop Soc. Metaverse Comput., Sens. Netw., Part: ACM SenSys, pp. 31–37, Association for Computing Machinery, Inc, 2024, ISBN: 979-840071299-9 (ISBN).
Abstract | Links | BibTeX | Tags: % reductions, Attack strategies, Computer simulation languages, Defense, Digital elevation model, Guard rails, Jailbreak, Language Model, Large language model, Metaverse Security, Metaverses, Natural languages, Performance, Virtual Reality
@inproceedings{kang_revealing_2024,
title = {Revealing the Difficulty in Jailbreak Defense on Language Models for Metaverse},
author = {Z. Kang and Y. Liu and J. Zheng and Z. Sun},
editor = {Gong Q. and He X.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85212189363&doi=10.1145%2f3698387.3699998&partnerID=40&md5=673326728c3db35ffbbaf807eb7f003c},
doi = {10.1145/3698387.3699998},
isbn = {979-840071299-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {SocialMeta - Proc. Int. Workshop Soc. Metaverse Comput., Sens. Netw., Part: ACM SenSys},
pages = {31–37},
publisher = {Association for Computing Machinery, Inc},
abstract = {Large language models (LLMs) have demonstrated exceptional capabilities in natural language processing tasks, fueling innovations in emerging areas such as the metaverse. These models enable dynamic virtual communities, enhancing user interactions and revolutionizing industries. However, their increasing deployment exposes vulnerabilities to jailbreak attacks, where adversaries can manipulate LLM-driven systems to generate harmful content. While various defense mechanisms have been proposed, their efficacy against diverse jailbreak techniques remains unclear. This paper addresses this gap by evaluating the performance of three popular defense methods (Backtranslation, Self-reminder, and Paraphrase) against different jailbreak attack strategies (GCG, BEAST, and Deepinception), while also utilizing three distinct models. Our findings reveal that while defenses are highly effective against optimization-based jailbreak attacks and reduce the attack success rate by 79% on average, they struggle in defending against attacks that alter attack motivations. Additionally, methods relying on self-reminding perform better when integrated with models featuring robust safety guardrails. For instance, Llama2-7b shows a 100% reduction in Attack Success Rate, while Vicuna-7b and Mistral-7b, lacking safety alignment, exhibit a lower average reduction of 65.8%. This study highlights the challenges in developing universal defense solutions for securing LLMs in dynamic environments like the metaverse. Furthermore, our study highlights that the three distinct models utilized demonstrate varying initial defense performance against different jailbreak attack strategies, underscoring the complexity of effectively securing LLMs. © 2024 Copyright held by the owner/author(s).},
keywords = {% reductions, Attack strategies, Computer simulation languages, Defense, Digital elevation model, Guard rails, Jailbreak, Language Model, Large language model, Metaverse Security, Metaverses, Natural languages, Performance, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
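Two of the evaluated defenses are simple enough to sketch (our illustration; llm(system, user) -> str is a hypothetical chat wrapper, not the paper's code). Paraphrasing tends to break token-level adversarial suffixes such as those produced by GCG, while self-reminders lean on the model's own safety alignment, consistent with the paper's finding that they work best on models with strong guardrails.

def paraphrase_defense(user_prompt, llm):
    # Rewriting the prompt with a trusted model destroys brittle
    # optimization-based jailbreak strings before the prompt is answered.
    paraphrased = llm("Paraphrase the following text faithfully.", user_prompt)
    return llm("You are a helpful, harmless assistant.", paraphrased)

def self_reminder_defense(user_prompt, llm):
    # Wrap the prompt in safety reminders; effective mainly when the
    # underlying model is already safety-aligned.
    system = ("You are a responsible assistant and must not generate "
              "harmful or misleading content.")
    return llm(system, user_prompt + "\nRemember: answer responsibly.")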
2023
Kouzelis, L. R.; Spantidi, O.
Synthesizing Play-Ready VR Scenes with Natural Language Prompts Through GPT API Proceedings Article
In: Bebis, G.; Ghiasi, G.; Fang, Y.; Sharf, A.; Dong, Y.; Weaver, C.; Leo, Z.; LaViola Jr., J.J.; Kohli, L. (Eds.): Lect. Notes Comput. Sci., pp. 15–26, Springer Science and Business Media Deutschland GmbH, 2023, ISSN: 0302-9743; ISBN: 978-303147965-6.
Abstract | Links | BibTeX | Tags: 3-d designs, 3D object, 3D scenes, AI-driven 3D Design, Language Model, Natural languages, Novel methodology, Scene Generation, Three dimensional computer graphics, Unity3d, Virtual Reality, Visual computing
@inproceedings{kouzelis_synthesizing_2023,
title = {Synthesizing Play-Ready VR Scenes with Natural Language Prompts Through GPT API},
author = {L. R. Kouzelis and O. Spantidi},
editor = {Bebis G. and Ghiasi G. and Fang Y. and Sharf A. and Dong Y. and Weaver C. and Leo Z. and LaViola Jr. J.J. and Kohli L.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85180626887&doi=10.1007%2f978-3-031-47966-3_2&partnerID=40&md5=d15c3e2f3260e2a68bdca91c29df7bbb},
doi = {10.1007/978-3-031-47966-3_2},
issn = {0302-9743},
isbn = {978-303147965-6},
year = {2023},
date = {2023-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14362},
pages = {15–26},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {In visual computing, 3D scene generation stands as a crucial component, offering applications in various fields such as gaming, virtual reality (VR), and architectural visualization. Creating realistic and versatile virtual environments, however, poses significant challenges. This work presents a novel methodology that leverages the capabilities of a widely adopted large language model (LLM) to address these challenges. Our approach utilizes the GPT API to interpret natural language prompts and generate detailed, VR-ready scenes within Unity3D. Our work is also inherently scalable, since the model accepts any database of 3D objects with minimal prior configuration. The effectiveness of the proposed system is demonstrated through a series of case studies, revealing its potential to generate diverse and functional virtual spaces. © 2023, The Author(s), under exclusive license to Springer Nature Switzerland AG.},
keywords = {3-d designs, 3D object, 3D scenes, AI-driven 3D Design, Language Model, Natural languages, Novel methodology, Scene Generation, Three dimensional computer graphics, Unity3d, Virtual Reality, Visual computing},
pubstate = {published},
tppubtype = {inproceedings}
}
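A minimal sketch of the prompt-and-validate shape such a pipeline takes (our reconstruction, not the authors' code; the catalog names are invented). Constraining the model to a known asset catalog is what lets the returned spec be instantiated from a Unity3D object database with minimal configuration.

import json

CATALOG = ["Chair", "Table", "Lamp", "Bookshelf", "RugRound"]

def layout_prompt(user_request):
    # Ask for machine-checkable JSON restricted to prefabs we actually have.
    schema = '{"objects": [{"prefab": "<name>", "position": [x, y, z], "yaw_degrees": 0.0}]}'
    return ("Return JSON only with this shape: " + schema +
            ". Use only these prefabs: " + ", ".join(CATALOG) +
            ". Request: " + user_request)

def validate_layout(llm_json):
    # Reject hallucinated assets before anything is spawned in the scene.
    objs = json.loads(llm_json)["objects"]
    for o in objs:
        if o["prefab"] not in CATALOG:
            raise ValueError(f"unknown prefab: {o['prefab']}")
    return objs

# A small loader on the Unity side can then instantiate each validated prefab
# at its position and yaw; the LLM never emits executable code.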
Fuchs, A.; Appel, S.; Grimm, P.
Immersive Spaces for Creativity: Smart Working Environments Proceedings Article
In: Yunanto, A.A.; Ramadhani, A.D.; Prayogi, Y.R.; Putra, P.A.M.; Ruswiansari, M.; Ridwan, M.; Gamar, F.; Rahmawati, W.M.; Rusli, M.R.; Humaira, F.M.; Adila, A.F. (Eds.): IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding, pp. 610–617, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835031473-1 (ISBN).
Abstract | Links | BibTeX | Tags: Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment
@inproceedings{fuchs_immersive_2023,
title = {Immersive Spaces for Creativity: Smart Working Environments},
author = {A. Fuchs and S. Appel and P. Grimm},
editor = {Yunanto A.A. and Ramadhani A.D. and Prayogi Y.R. and Putra P.A.M. and Ruswiansari M. and Ridwan M. and Gamar F. and Rahmawati W.M. and Rusli M.R. and Humaira F.M. and Adila A.F.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85173627291&doi=10.1109%2fIES59143.2023.10242458&partnerID=40&md5=6ab1796f68c29d7747574272314a2e9d},
doi = {10.1109/IES59143.2023.10242458},
isbn = {979-835031473-1 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {IES - Int. Electron. Symp.: Unlocking Potential Immersive Technol. Live Better Life, Proceeding},
pages = {610–617},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This paper presents an innovative approach to designing an immersive space that dynamically supports users' (inter)actions based on their behavior, voice, and mood, providing a personalized experience. The objective of this research is to explore how a space can communicate with users in a seamless, engaging, and interactive environment. Therefore, it integrates natural language processing (NLP), generative artificial intelligence applications, and human-computer interaction, using a combination of sensors, microphones, and cameras to collect real-time data on users' behavior, voice, and mood. This data is then processed and analyzed by an intelligent system that employs machine learning algorithms to identify patterns and adapt the environment accordingly. The adaptive features include changes in lighting, sound, and visual elements to facilitate creativity, focus, relaxation, or socialization, depending on the user's topics and emotional state. The paper discusses the technical aspects of implementing such a system. Additionally, it highlights the potential applications of this technology in various domains such as education, entertainment, and workplace settings. In conclusion, the immersive creative space represents a paradigm shift in human-environment interaction, offering a dynamic and personalized space that caters to the diverse needs of users. The research findings suggest that this innovative approach holds great promise for enhancing user experiences, fostering creativity, and promoting overall well-being. © 2023 IEEE.},
keywords = {Artificial intelligence, Generative AI, Human computer interaction, Immersive, Innovative approaches, Intelligent systems, Interactive Environments, Language Model, Language processing, Large language model, large language models, Learning algorithms, machine learning, Natural language processing systems, Natural languages, User behaviors, User interfaces, Virtual Reality, Working environment},
pubstate = {published},
tppubtype = {inproceedings}
}
Vincent, B.; Ayyar, K.
Roblox Generative AI in action Proceedings Article
In: Spencer, S.N. (Ed.): Proc. - SIGGRAPH Real-Time Live!, Association for Computing Machinery, Inc, 2023, ISBN: 979-840070158-0 (ISBN).
Abstract | Links | BibTeX | Tags: AI techniques, Complex model, Creation process, Education, Game, Games, Interactive computer graphics, Interactive objects, Lighting, Metaverse, Metaverses, Modeling, Modeling languages, Natural languages, Object and scenes, Pipeline, Real-Time Rendering, Rendering (computer graphics)
@inproceedings{vincent_roblox_2023,
title = {Roblox Generative AI in action},
author = {B. Vincent and K. Ayyar},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85167946022&doi=10.1145%2f3588430.3597250&partnerID=40&md5=61fda81c33eb3623240f7d14f51607b0},
doi = {10.1145/3588430.3597250},
isbn = {979-840070158-0 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. - SIGGRAPH Real-Time Live!},
publisher = {Association for Computing Machinery, Inc},
abstract = {Roblox is investing in generative AI techniques to revolutionize the creation process on its platform. By leveraging natural language and other intuitive expressions of intent, creators can build interactive objects and scenes without complex modeling or coding. The use of AI image generation services and large language models aims to make creation faster and easier for every user on the platform. © 2023 Owner/Author.},
keywords = {AI techniques, Complex model, Creation process, Education, Game, Games, Interactive computer graphics, Interactive objects, Lighting, Metaverse, Metaverses, Modeling, Modeling languages, Natural languages, Object and scenes, Pipeline, Real-Time Rendering, Rendering (computer graphics)},
pubstate = {published},
tppubtype = {inproceedings}
}
Joseph, S.; Priya, B. S.; Poorvaja, R.; Kumaran, M. Santhosh; Shivaraj, S.; Jeyanth, V.; Shivesh, R. P.
IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI Proceedings Article
In: Arya, K.V.; Wada, T. (Eds.): Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835030514-2 (ISBN).
Abstract | Links | BibTeX | Tags: 2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones
@inproceedings{joseph_iot_2023,
title = {IoT Empowered AI: Transforming Object Recognition and NLP Summarization with Generative AI},
author = {S. Joseph and B. S. Priya and R. Poorvaja and M. Santhosh Kumaran and S. Shivaraj and V. Jeyanth and R. P. Shivesh},
editor = {Arya K.V. and Wada T.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85189754688&doi=10.1109%2fCVMI59935.2023.10465077&partnerID=40&md5=9c1a9d7151c0b04bab83586f515d30aa},
doi = {10.1109/CVMI59935.2023.10465077},
isbn = {979-835030514-2 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {In anticipation of the widespread adoption of augmented reality in the future, this paper introduces an advanced mobile application that seamlessly integrates AR and IoT technologies. The application aims to make these cutting-edge technologies more affordable and accessible to users while highlighting their immense benefits in assisting with household appliance control, as well as providing interactive and educational experiences. The app employs advanced algorithms such as object detection, Natural Language Processing (NLP), and Optical Character Recognition (OCR) to scan the smartphone's camera feed. Upon identification, AR controls for appliances, their power consumption, and electric bill tracking are displayed. Additionally, the application makes use of APIs to access the internet, retrieving relevant 3D generative models, 360-degree videos, 2D images, and textual information based on user interactions with detected objects. Users can effortlessly explore and interact with the 3D generative models using intuitive hand gestures, providing an immersive experience without the need for additional hardware or dedicated VR headsets. Beyond home automation, the app offers valuable educational benefits, serving as a unique learning tool for students to gain hands-on experience. Medical practitioners can quickly reference organ anatomy and utilize its feature-rich functionalities. Its cost-effectiveness, requiring only installation, ensures accessibility to a wide audience. The app's functionality is both intuitive and efficient, detecting objects in the camera feed and prompting user interactions. Users can select objects through simple hand gestures, choosing desired content like 3D generative models, 2D images, textual information, 360-degree videos, or shopping-related details. The app then retrieves and overlays the requested information onto the real-world view in AR. In conclusion, this groundbreaking AR and IoT-powered app revolutionizes home automation and learning experiences, leveraging only a smartphone's camera, without the need for additional hardware or expensive installations. Its potential applications extend to education, industries, and health care, making it a versatile and valuable tool for a broad range of users. © 2023 IEEE.},
keywords = {2D, 3D, Application program interface, Application Program Interface (API), Application program interfaces, Application programming interfaces (API), Application programs, Augmented Reality, Augmented Reality(AR), Automation, Cameras, Cost effectiveness, Domestic appliances, GenAl, Internet of Things, Internet of Things (IoT) technologies, Internet of things technologies, Language processing, Natural Language Processing, Natural language processing systems, Natural languages, Object Detection, Object recognition, Objects detection, Optical character recognition, Optical Character Recognition (OCR), Smartphones},
pubstate = {published},
tppubtype = {inproceedings}
}
Vaidhyanathan, V.; Radhakrishnan, T. R.; López, J. L. G.
Spacify: A Generative Framework for Spatial Comprehension, Articulation and Visualization using Large Language Models (LLMs) and eXtended Reality (XR) Proceedings Article
In: Crawford, A.; Diniz, N.M.; Beckett, R.; Vanucchi, J.; Swackhamer, M. (Eds.): Habits Anthropocene: Scarcity Abundance Post-Mater. Econ. - Proc. Annu. Conf. Assoc. Comput. Aided Des. Archit., ACADIA, pp. 430–443, Association for Computer Aided Design in Architecture, 2023, ISBN: 979-898608059-8 (ISBN).
Abstract | Links | BibTeX | Tags: 3D data processing, 3D spaces, Architectural design, Built environment, C (programming language), Computational Linguistics, Computer aided design, Computer architecture, Data handling, Data users, Data visualization, Immersive media, Interior designers, Language Model, Natural languages, Spatial design, Three dimensional computer graphics, Urban designers, User interfaces, Visualization
@inproceedings{vaidhyanathan_spacify_2023,
title = {Spacify: A Generative Framework for Spatial Comprehension, Articulation and Visualization using Large Language Models (LLMs) and eXtended Reality (XR)},
author = {V. Vaidhyanathan and T. R. Radhakrishnan and J. L. G. López},
editor = {Crawford A. and Diniz N.M. and Beckett R. and Vanucchi J. and Swackhamer M.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85192831586&partnerID=40&md5=996906de0f5ef1e6c88b10bb65caabc0},
isbn = {979-898608059-8 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Habits Anthropocene: Scarcity Abundance Post-Mater. Econ. - Proc. Annu. Conf. Assoc. Comput. Aided Des. Archit., ACADIA},
volume = {2},
pages = {430–443},
publisher = {Association for Computer Aided Design in Architecture},
abstract = {Spatial design, the thoughtful planning and creation of built environments, typically requires advanced technical knowledge and visuospatial skills, making it largely exclusive to professionals like architects, interior designers, and urban designers. This exclusivity limits non-experts' access to spatial design, despite their ability to describe requirements and suggestions in natural language. Recent advancements in generative artificial intelligence (AI), particularly large language models (LLMs), and extended reality (XR) offer the potential to address this limitation. This paper introduces Spacify (Figure 1), a framework that utilizes the generalizing capabilities of LLMs, 3D data-processing, and XR interfaces to create an immersive medium for language-driven spatial understanding, design, and visualization for non-experts. This paper describes the five components of Spacify: External Data, User Input, Spatial Interface, Large Language Model, and Current Spatial Design, which enable the use of generative AI models in a) question/answering about 3D spaces with reasoning, b) (re)generating 3D spatial designs with natural language prompts, and c) visualizing designed 3D spaces with natural language descriptions. An implementation of Spacify is demonstrated via an XR smartphone application, allowing for an end-to-end, language-driven interior design process. User survey results from non-experts redesigning their spaces in 3D using this application suggest that Spacify can make spatial design accessible using natural language prompts, thereby pioneering a new realm of spatial design that is naturally language-driven. © ACADIA 2023. All rights reserved.},
keywords = {3D data processing, 3D spaces, Architectural design, Built environment, C (programming language), Computational Linguistics, Computer aided design, Computer architecture, Data handling, Data users, Data visualization, Immersive media, Interior designers, Language Model, Natural languages, Spatial design, Three dimensional computer graphics, Urban designers, User interfaces, Visualization},
pubstate = {published},
tppubtype = {inproceedings}
}
DeChant, C.; Akinola, I.; Bauer, D.
Learning to summarize and answer questions about a virtual robot’s past actions Journal Article
In: Autonomous Robots, vol. 47, no. 8, pp. 1103–1118, 2023, ISSN: 0929-5593.
Abstract | Links | BibTeX | Tags: Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning
@article{dechant_learning_2023,
title = {Learning to summarize and answer questions about a virtual robot’s past actions},
author = {C. DeChant and I. Akinola and D. Bauer},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85176588341&doi=10.1007%2fs10514-023-10134-4&partnerID=40&md5=162b3343d5f000f2b79f59c339f99022},
doi = {10.1007/s10514-023-10134-4},
issn = {0929-5593},
year = {2023},
date = {2023-01-01},
journal = {Autonomous Robots},
volume = {47},
number = {8},
pages = {1103–1118},
abstract = {When robots perform long action sequences, users will want to easily and reliably find out what they have done. We therefore demonstrate the task of learning to summarize and answer questions about a robot agent’s past actions using natural language alone. A single system with a large language model at its core is trained to both summarize and answer questions about action sequences given ego-centric video frames of a virtual robot and a question prompt. To enable training of question answering, we develop a method to automatically generate English-language questions and answers about objects, actions, and the temporal order in which actions occurred during episodes of robot action in the virtual environment. Training one model to both summarize and answer questions enables zero-shot transfer of representations of objects learned through question answering to improved action summarization. © 2023, The Author(s).},
keywords = {Action sequences, E-Learning, Interpretability, Language Model, Long horizon task, Long horizon tasks, Natural language processing systems, Natural languages, Question Answering, Representation learning, Robots, Summarization, Video frame, Virtual Reality, Virtual robots, Zero-shot learning},
pubstate = {published},
tppubtype = {article}
}
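The automatic question generation described in the abstract amounts to templating over an episode's action log. A tiny sketch of the idea (our reconstruction; the log format and question templates are invented):

episode = [("pick up", "the red block"), ("place", "the red block on the table"),
           ("pick up", "the blue block")]

def generate_qa(episode):
    # Object/action questions from single steps, temporal-order questions from pairs.
    qa = [(f"What did the robot do in step {i + 1}?", f"{act} {obj}")
          for i, (act, obj) in enumerate(episode)]
    for (a1, o1), (a2, o2) in zip(episode, episode[1:]):
        qa.append((f"Did the robot {a1} {o1} before it {a2} {o2}?", "yes"))
    return qa

for question, answer in generate_qa(episode):
    print(question, "->", answer)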
Le, M. -H.; Chu, C. -B.; Le, K. -D.; Nguyen, T. V.; Tran, M. -T.; Le, T. -N.
VIDES: Virtual Interior Design via Natural Language and Visual Guidance Proceedings Article
In: Bruder, G.; Olivier, A.H.; Cunningham, A.; Peng, E.Y.; Grubert, J.; Williams, I. (Eds.): Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct, pp. 689–694, Institute of Electrical and Electronics Engineers Inc., 2023, ISBN: 979-835032891-2 (ISBN).
Abstract | Links | BibTeX | Tags: Architectural design, Customisation, Cutting edge technology, Design concept, Design systems, Image editing, Image generation, Image generations, Indoor space, Interior Design, Interior designs, Interiors (building), Natural languages, Virtual Reality, Visual guidance, Visual languages
@inproceedings{le_vides_2023,
title = {VIDES: Virtual Interior Design via Natural Language and Visual Guidance},
author = {M. -H. Le and C. -B. Chu and K. -D. Le and T. V. Nguyen and M. -T. Tran and T. -N. Le},
editor = {Bruder G. and Olivier A.H. and Cunningham A. and Peng E.Y. and Grubert J. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85180376943&doi=10.1109%2fISMAR-Adjunct60411.2023.00148&partnerID=40&md5=5ce45d9e97fc5a9fdc31eb7514b3def3},
doi = {10.1109/ISMAR-Adjunct60411.2023.00148},
isbn = {979-835032891-2 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real. Adjunct, ISMAR-Adjunct},
pages = {689–694},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Interior design is crucial in creating aesthetically pleasing and functional indoor spaces. However, developing and editing interior design concepts requires significant time and expertise. We propose the Virtual Interior DESign (VIDES) system in response to this challenge. Leveraging cutting-edge technology in generative AI, our system can assist users in generating and editing indoor scene concepts quickly, given user text description and visual guidance. Using both visual guidance and language as the conditional inputs significantly enhances the accuracy and coherence of the generated scenes, resulting in visually appealing designs. Through extensive experimentation, we demonstrate the effectiveness of VIDES in developing new indoor concepts, changing indoor styles, and replacing and removing interior objects. The system successfully captures the essence of users' descriptions while providing flexibility for customization. Consequently, this system can potentially reduce the entry barrier for indoor design, making it more accessible to users with limited technical skills and reducing the time required to create high-quality images. Individuals who have a background in design can now easily communicate their ideas visually and effectively present their design concepts. © 2023 IEEE.},
keywords = {Architectural design, Customisation, Cutting edge technology, Design concept, Design systems, Image editing, Image generation, Image generations, Indoor space, Interior Design, Interior designs, Interiors (building), Natural languages, Virtual Reality, Visual guidance, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}