AHCI RESEARCH GROUP
Publications
Papers published in international journals, proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
2025
Lv, J.; Slowik, A.; Rani, S.; Kim, B. -G.; Chen, C. -M.; Kumari, S.; Li, K.; Lyu, X.; Jiang, H.
Multimodal Metaverse Healthcare: A Collaborative Representation and Adaptive Fusion Approach for Generative Artificial-Intelligence-Driven Diagnosis Journal Article
In: Research, vol. 8, 2025, ISSN: 20965168 (ISSN).
@article{lv_multimodal_2025,
title = {Multimodal Metaverse Healthcare: A Collaborative Representation and Adaptive Fusion Approach for Generative Artificial-Intelligence-Driven Diagnosis},
author = {J. Lv and A. Slowik and S. Rani and B. -G. Kim and C. -M. Chen and S. Kumari and K. Li and X. Lyu and H. Jiang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-86000613924&doi=10.34133%2fresearch.0616&partnerID=40&md5=fdc8ae3b29db905105dada9a5657b54b},
doi = {10.34133/research.0616},
issn = {20965168 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {Research},
volume = {8},
abstract = {The metaverse enables immersive virtual healthcare environments, presenting opportunities for enhanced care delivery. A key challenge lies in effectively combining multimodal healthcare data and generative artificial intelligence abilities within metaverse-based healthcare applications, which is a problem that needs to be addressed. This paper proposes a novel multimodal learning framework for metaverse healthcare, MMLMH, based on collaborative intra- and intersample representation and adaptive fusion. Our framework introduces a collaborative representation learning approach that captures shared and modality-specific features across text, audio, and visual health data. By combining modality-specific and shared encoders with carefully formulated intrasample and intersample collaboration mechanisms, MMLMH achieves superior feature representation for complex health assessments. The framework’s adaptive fusion approach, utilizing attention mechanisms and gated neural networks, demonstrates robust performance across varying noise levels and data quality conditions. Experiments on metaverse healthcare datasets demonstrate MMLMH’s superior performance over baseline methods across multiple evaluation metrics. Longitudinal studies and visualization further illustrate MMLMH’s adaptability to evolving virtual environments and balanced performance across diagnostic accuracy, patient–system interaction efficacy, and data integration complexity. The proposed framework has a unique advantage in that a similar level of performance is maintained across various patient populations and virtual avatars, which could lead to greater personalization of healthcare experiences in the metaverse. MMLMH’s successful functioning in such complicated circumstances suggests that it can combine and process information streams from several sources. They can be successfully utilized in next-generation healthcare delivery through virtual reality. © 2025 Jianhui Lv et al.},
keywords = {Adaptive fusion, Collaborative representations, Diagnosis, Electronic health record, Generative adversarial networks, Health care application, Healthcare environments, Immersive, Learning frameworks, Metaverses, Multi-modal, Multi-modal learning, Performance},
pubstate = {published},
tppubtype = {article}
}
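The adaptive fusion step described in the abstract (attention over modality-specific features plus a gating network) can be pictured with a minimal PyTorch sketch. This is not the authors' MMLMH code: the module name AdaptiveFusion, the 256-dimensional features, and the single-layer gate are illustrative assumptions.
# Minimal sketch of attention-plus-gating multimodal fusion; illustrative only.
import torch
import torch.nn as nn

class AdaptiveFusion(nn.Module):
    def __init__(self, dim: int = 256, n_modalities: int = 3):
        super().__init__()
        self.attn = nn.Linear(dim, 1)   # one attention score per modality feature
        self.gate = nn.Sequential(nn.Linear(dim * n_modalities, dim), nn.Sigmoid())

    def forward(self, feats):                     # feats: list of (batch, dim) tensors
        stacked = torch.stack(feats, dim=1)       # (batch, n_modalities, dim)
        weights = torch.softmax(self.attn(stacked), dim=1)
        fused = (weights * stacked).sum(dim=1)    # attention-weighted sum over modalities
        gate = self.gate(torch.cat(feats, dim=-1))
        return gate * fused                       # gated fused representation, (batch, dim)

# Example: fuse text, audio, and visual embeddings of one sample.
text, audio, visual = (torch.randn(1, 256) for _ in range(3))
print(AdaptiveFusion()([text, audio, visual]).shape)   # torch.Size([1, 256])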
Song, T.; Liu, Z.; Zhao, R.; Fu, J.
ElderEase AR: Enhancing Elderly Daily Living with the Multimodal Large Language Model and Augmented Reality Proceedings Article
In: ICVRT - Proc. Int. Conf. Virtual Real. Technol., pp. 60–67, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071018-6 (ISBN).
@inproceedings{song_elderease_2025,
title = {ElderEase AR: Enhancing Elderly Daily Living with the Multimodal Large Language Model and Augmented Reality},
author = {T. Song and Z. Liu and R. Zhao and J. Fu},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001924899&doi=10.1145%2f3711496.3711505&partnerID=40&md5=4df693735547b505172657a73359f3ca},
doi = {10.1145/3711496.3711505},
isbn = {979-840071018-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {ICVRT - Proc. Int. Conf. Virtual Real. Technol.},
pages = {60–67},
publisher = {Association for Computing Machinery, Inc},
abstract = {Elderly individuals often face challenges in independent living due to age-related cognitive and physical decline. To address these issues, we propose an innovative Augmented Reality (AR) system, “ElderEase AR”, designed to assist elderly users in their daily lives by leveraging a Multimodal Large Language Model (MLLM). This system enables elderly users to capture images of their surroundings and ask related questions, providing context-aware feedback. We evaluated the system’s perceived ease-of-use and feasibility through a pilot study involving 30 elderly users, aiming to enhance their independence and quality of life. Our system integrates advanced AR technology with an intelligent agent trained on multimodal datasets. Through prompt engineering, the agent is tailored to respond in a manner that aligns with the speaking style of elderly users. Experimental results demonstrate high accuracy in object recognition and question answering, with positive feedback from user trials. Specifically, the system accurately identified objects in various environments and provided relevant answers to user queries. This study highlights the powerful potential of AR and AI technologies in creating support tools for the elderly. It suggests directions for future improvements and applications, such as enhancing the system’s adaptability to different user needs and expanding its functionality to cover more aspects of daily living. © 2024 Copyright held by the owner/author(s).},
keywords = {Age-related, Assisted living, Augmented Reality, Augmented reality technology, Daily Life Support, Daily living, Daily-life supports, Elderly, Elderly users, Independent living, Independent living systems, Language Model, Modeling languages, Multi agent systems, Multi-modal, Multimodal large language model},
pubstate = {published},
tppubtype = {inproceedings}
}
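The interaction loop in the abstract (capture a photo of the surroundings, ask a question, get feedback phrased for an elderly user) amounts to one multimodal chat call plus a persona prompt. The paper does not name its model or SDK; the OpenAI client, the gpt-4o-mini model name, and the persona wording below are placeholder assumptions.
# Illustrative only: the paper does not specify its MLLM or SDK.
# Assumes the openai Python package (v1+) and OPENAI_API_KEY in the environment.
import base64
from openai import OpenAI

SYSTEM_STYLE = ("You are a patient assistant for an elderly user. "
                "Answer in short, plain sentences, avoid jargon, and add one safety tip.")

def describe_surroundings(image_path: str, question: str) -> str:
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    client = OpenAI()
    resp = client.chat.completions.create(
        model="gpt-4o-mini",   # placeholder model choice, not the one used in the paper
        messages=[
            {"role": "system", "content": SYSTEM_STYLE},
            {"role": "user", "content": [
                {"type": "text", "text": question},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ]},
        ],
    )
    return resp.choices[0].message.content

# Usage: describe_surroundings("kitchen.jpg", "What is this appliance and how do I use it safely?")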
Coronado, A.; Carvalho, S. T.; Berretta, L.
See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People Proceedings Article
In: IMX - Proc. ACM Int. Conf. Interact. Media Experiences, pp. 451–457, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071391-0 (ISBN).
@inproceedings{coronado_see_2025,
title = {See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People},
author = {A. Coronado and S. T. Carvalho and L. Berretta},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007991842&doi=10.1145%2f3706370.3731641&partnerID=40&md5=2f7cb1535d39d5e59b1f43f773de3272},
doi = {10.1145/3706370.3731641},
isbn = {979-840071391-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {IMX - Proc. ACM Int. Conf. Interact. Media Experiences},
pages = {451–457},
publisher = {Association for Computing Machinery, Inc},
abstract = {Extended Reality (XR) is quickly expanding "as the next major technology wave in personal computing". Nevertheless, this expansion and adoption could also exclude certain disabled users, particularly people with visual impairment (VIP). According to the World Health Organization (WHO) in their 2019 publication, there were at least 2.2 billion people with visual impairment, a number that is also estimated to have increased in recent years. Therefore, it is important to include disabled users, especially visually impaired people, in the design of Head-Mounted Displays and Extended Reality environments. Indeed, this objective can be pursued by incorporating Multimodal Large Language Model (MLLM) technology, which can assist visually impaired people. As a case study, this study employs different prompts that result in environment descriptions from an MLLM integrated into a virtual reality (VR) escape room. Therefore, six potential prompts were engineered to generate valuable outputs for visually impaired users inside a VR environment. These outputs were evaluated using the G-Eval, and VIEScore metrics. Even though, the results show that the prompt patterns provided a description that aligns with the user's point of view, it is highly recommended to evaluate these outputs through "expected outputs" from Orientation and Mobility Specialists, and Sighted Guides. Furthermore, the subsequent step in the process is to evaluate these outputs by visually impaired people themselves to identify the most effective prompt pattern. © 2025 Copyright held by the owner/author(s).},
keywords = {Accessibility, Behavioral Research, Blind, Blind people, Helmet mounted displays, Human engineering, Human rehabilitation equipment, Interactive computer graphics, Interactive computer systems, Language Model, LLM, Multi-modal, Rendered environment, rendered environments, Spatial cognition, Virtual Reality, Vision aids, Visual impairment, Visual languages, Visually impaired people},
pubstate = {published},
tppubtype = {inproceedings}
}
Tong, Y.; Qiu, Y.; Li, R.; Qiu, S.; Heng, P. -A.
MS2Mesh-XR: Multi-Modal Sketch-to-Mesh Generation in XR Environments Proceedings Article
In: Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR, pp. 272–276, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833152157-8 (ISBN).
@inproceedings{tong_ms2mesh-xr_2025,
title = {MS2Mesh-XR: Multi-Modal Sketch-to-Mesh Generation in XR Environments},
author = {Y. Tong and Y. Qiu and R. Li and S. Qiu and P. -A. Heng},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105000423684&doi=10.1109%2fAIxVR63409.2025.00052&partnerID=40&md5=caeace6850dcbdf8c1fa0441b98fa8d9},
doi = {10.1109/AIxVR63409.2025.00052},
isbn = {979-833152157-8 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Int. Conf. Artif. Intell. Ext. Virtual Real., AIxVR},
pages = {272–276},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {We present MS2Mesh-XR, a novel multimodal sketch-to-mesh generation pipeline that enables users to create realistic 3D objects in extended reality (XR) environments using hand-drawn sketches assisted by voice inputs. In specific, users can intuitively sketch objects using natural hand movements in mid-air within a virtual environment. By integrating voice inputs, we devise ControlNet to infer realistic images based on the drawn sketches and interpreted text prompts. Users can then review and select their preferred image, which is subsequently reconstructed into a detailed 3D mesh using the Convolutional Reconstruction Model. In particular, our proposed pipeline can generate a high-quality 3D mesh in less than 20 seconds, allowing for immersive visualization and manipulation in runtime XR scenes. We demonstrate the practicability of our pipeline through two use cases in XR settings. By leveraging natural user inputs and cutting-edge generative AI capabilities, our approach can significantly facilitate XR-based creative production and enhance user experiences. Our code and demo will be available at: https://yueqiu0911.github.io/MS2Mesh-XR/. © 2025 IEEE.},
keywords = {3D meshes, 3D object, ControlNet, Hand-drawn sketches, Hands movement, High quality, Image-based, immersive visualization, Mesh generation, Multi-modal, Pipeline codes, Realistic images, Three dimensional computer graphics, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
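The stage ordering in the abstract (mid-air sketch plus voice, ControlNet-conditioned image candidates, user selection, image-to-mesh reconstruction) can be outlined as a plain pipeline skeleton. Only the ordering comes from the abstract; every stage function below is a hypothetical stub, not the authors' code.
# Pipeline skeleton with hypothetical stubs in place of the actual models.
from dataclasses import dataclass
from typing import List

@dataclass
class Candidate:
    image_path: str
    prompt: str

def transcribe_voice(audio_path: str) -> str:
    """Stub for speech-to-text over the spoken description."""
    return "a wooden chair with armrests"

def controlnet_candidates(sketch_path: str, prompt: str, n: int = 4) -> List[Candidate]:
    """Stub for sketch-conditioned image generation (ControlNet in the paper)."""
    return [Candidate(f"candidate_{i}.png", prompt) for i in range(n)]

def reconstruct_mesh(image_path: str) -> str:
    """Stub for image-to-3D reconstruction (Convolutional Reconstruction Model in the paper)."""
    return image_path.replace(".png", ".obj")

def sketch_to_mesh(sketch_path: str, audio_path: str, pick: int = 0) -> str:
    prompt = transcribe_voice(audio_path)
    candidates = controlnet_candidates(sketch_path, prompt)
    chosen = candidates[pick]            # in XR, the user reviews and selects this image
    return reconstruct_mesh(chosen.image_path)

print(sketch_to_mesh("sketch.png", "voice.wav"))   # candidate_0.obj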
Kim, Y.; Aamir, Z.; Singh, M.; Boorboor, S.; Mueller, K.; Kaufman, A. E.
Explainable XR: Understanding User Behaviors of XR Environments Using LLM-Assisted Analytics Framework Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2756–2766, 2025, ISSN: 10772626 (ISSN).
@article{kim_explainable_2025,
title = {Explainable XR: Understanding User Behaviors of XR Environments Using LLM-Assisted Analytics Framework},
author = {Y. Kim and Z. Aamir and M. Singh and S. Boorboor and K. Mueller and A. E. Kaufman},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003815583&doi=10.1109%2fTVCG.2025.3549537&partnerID=40&md5=1085b698db06656985f80418cb37b773},
doi = {10.1109/TVCG.2025.3549537},
issn = {10772626 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2756–2766},
abstract = {We present Explainable XR, an end-to-end framework for analyzing user behavior in diverse eXtended Reality (XR) environments by leveraging Large Language Models (LLMs) for data interpretation assistance. Existing XR user analytics frameworks face challenges in handling cross-virtuality - AR, VR, MR - transitions, multi-user collaborative application scenarios, and the complexity of multimodal data. Explainable XR addresses these challenges by providing a virtuality-agnostic solution for the collection, analysis, and visualization of immersive sessions. We propose three main components in our framework: (1) A novel user data recording schema, called User Action Descriptor (UAD), that can capture the users' multimodal actions, along with their intents and the contexts; (2) a platform-agnostic XR session recorder, and (3) a visual analytics interface that offers LLM-assisted insights tailored to the analysts' perspectives, facilitating the exploration and analysis of the recorded XR session data. We demonstrate the versatility of Explainable XR by demonstrating five use-case scenarios, in both individual and collaborative XR applications across virtualities. Our technical evaluation and user studies show that Explainable XR provides a highly usable analytics solution for understanding user actions and delivering multifaceted, actionable insights into user behaviors in immersive environments. © 1995-2012 IEEE.},
keywords = {adult, Agnostic, Article, Assistive, Cross Reality, Data Analytics, Data collection, data interpretation, Data recording, Data visualization, Extended reality, human, Language Model, Large language model, large language models, Multi-modal, Multimodal Data Collection, normal human, Personalized assistive technique, Personalized Assistive Techniques, recorder, Spatio-temporal data, therapy, user behavior, User behaviors, Virtual addresses, Virtual environments, Virtual Reality, Visual analytics, Visual languages},
pubstate = {published},
tppubtype = {article}
}
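The abstract describes the User Action Descriptor only at a high level (multimodal actions together with intents and contexts, recorded across virtualities). A minimal record of that general shape might look like the dataclass below; the field names and the JSON-lines logging are assumptions, not the published UAD schema.
# Illustrative record shape; the actual UAD schema is defined in the paper, not here.
import json
from dataclasses import dataclass, asdict, field
from typing import Any, Dict

@dataclass
class UserAction:
    timestamp: float        # seconds since session start
    actor_id: str           # which user or collaborator acted
    virtuality: str         # "AR", "VR", or "MR"
    modality: str           # e.g. "gaze", "hand", "voice"
    action: str             # what was done, e.g. "select"
    intent: str             # intent label attached to the action
    context: Dict[str, Any] = field(default_factory=dict)   # scene / target metadata

def append_record(path: str, record: UserAction) -> None:
    """Append one action as a JSON line, so a session becomes a replayable log."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(asdict(record)) + "\n")

append_record("session.jsonl", UserAction(
    timestamp=12.4, actor_id="user-1", virtuality="VR", modality="hand",
    action="select", intent="inspect object", context={"target": "valve_03"}))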
Stacchio, L.; Balloni, E.; Frontoni, E.; Paolanti, M.; Zingaretti, P.; Pierdicca, R.
MineVRA: Exploring the Role of Generative AI-Driven Content Development in XR Environments through a Context-Aware Approach Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 3602–3612, 2025, ISSN: 10772626 (ISSN).
@article{stacchio_minevra_2025,
title = {MineVRA: Exploring the Role of Generative AI-Driven Content Development in XR Environments through a Context-Aware Approach},
author = {L. Stacchio and E. Balloni and E. Frontoni and M. Paolanti and P. Zingaretti and R. Pierdicca},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003746367&doi=10.1109%2fTVCG.2025.3549160&partnerID=40&md5=70b162b574eebbb0cb71db871aa787e1},
doi = {10.1109/TVCG.2025.3549160},
issn = {10772626 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {3602–3612},
abstract = {The convergence of Artificial Intelligence (AI), Computer Vision (CV), Computer Graphics (CG), and Extended Reality (XR) is driving innovation in immersive environments. A key challenge in these environments is the creation of personalized 3D assets, traditionally achieved through manual modeling, a time-consuming process that often fails to meet individual user needs. More recently, Generative AI (GenAI) has emerged as a promising solution for automated, context-aware content generation. In this paper, we present MineVRA (Multimodal generative artificial iNtelligence for contExt-aware Virtual Reality Assets), a novel Human-In-The-Loop (HITL) XR framework that integrates GenAI to facilitate coherent and adaptive 3D content generation in immersive scenarios. To evaluate the effectiveness of this approach, we conducted a comparative user study analyzing the performance and user satisfaction of GenAI-generated 3D objects compared to those generated by Sketchfab in different immersive contexts. The results suggest that GenAI can significantly complement traditional 3D asset libraries, with valuable design implications for the development of human-centered XR environments. © 1995-2012 IEEE.},
keywords = {adult, Article, Artificial intelligence, Computer graphics, Computer vision, Content Development, Contents development, Context-Aware, Context-aware approaches, Extended reality, female, Generative adversarial networks, Generative AI, generative artificial intelligence, human, Human-in-the-loop, Immersive, Immersive environment, male, Multi-modal, User need, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
2024
Weerasinghe, K.; Janapati, S.; Ge, X.; Kim, S.; Iyer, S.; Stankovic, J. A.; Alemzadeh, H.
Real-Time Multimodal Cognitive Assistant for Emergency Medical Services Proceedings Article
In: Proc. - ACM/IEEE Conf. Internet-of-Things Des. Implement., IoTDI, pp. 85–96, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835037025-6 (ISBN).
@inproceedings{weerasinghe_real-time_2024,
title = {Real-Time Multimodal Cognitive Assistant for Emergency Medical Services},
author = {K. Weerasinghe and S. Janapati and X. Ge and S. Kim and S. Iyer and J. A. Stankovic and H. Alemzadeh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85197769304&doi=10.1109%2fIoTDI61053.2024.00012&partnerID=40&md5=a3b7cf14e46ecb2d4e49905fb845f2c9},
doi = {10.1109/IoTDI61053.2024.00012},
isbn = {979-835037025-6 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - ACM/IEEE Conf. Internet-of-Things Des. Implement., IoTDI},
pages = {85–96},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Emergency Medical Services (EMS) responders often operate under time-sensitive conditions, facing cognitive overload and inherent risks, requiring essential skills in critical thinking and rapid decision-making. This paper presents CognitiveEMS, an end-to-end wearable cognitive assistant system that can act as a collaborative virtual partner engaging in the real-time acquisition and analysis of multimodal data from an emergency scene and interacting with EMS responders through Augmented Reality (AR) smart glasses. CognitiveEMS processes the continuous streams of data in real-time and leverages edge computing to provide assistance in EMS protocol selection and intervention recognition. We address key technical challenges in real-time cognitive assistance by introducing three novel components: (i) a Speech Recognition model that is fine-tuned for real-world medical emergency conversations using simulated EMS audio recordings, augmented with synthetic data generated by large language models (LLMs); (ii) an EMS Protocol Prediction model that combines state-of-the-art (SOTA) tiny language models with EMS domain knowledge using graph-based attention mechanisms; (iii) an EMS Action Recognition module which leverages multimodal audio and video data and protocol predictions to infer the intervention/treatment actions taken by the responders at the incident scene. Our results show that for speech recognition we achieve superior performance compared to SOTA (WER of 0.290 vs. 0.618) on conversational data. Our protocol prediction component also significantly outperforms SOTA (top-3 accuracy of 0.800 vs. 0.200) and the action recognition achieves an accuracy of 0.727, while maintaining an end-to-end latency of 3.78s for protocol prediction on the edge and 0.31s on the server. © 2024 IEEE.},
keywords = {Artificial intelligence, Augmented Reality, Cognitive Assistance, Computational Linguistics, Decision making, Domain knowledge, Edge computing, Emergency medical services, Forecasting, Graphic methods, Language Model, machine learning, Machine-learning, Multi-modal, Real- time, Service protocols, Smart Health, Speech recognition, State of the art},
pubstate = {published},
tppubtype = {inproceedings}
}
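The three components named in the abstract (fine-tuned speech recognition, protocol prediction, action recognition) chain into a single assistance loop; the sketch below shows only that control flow, with hypothetical stubs in place of the actual models.
# Control-flow sketch only; every model call is a hypothetical stub.
from typing import List, Tuple

def transcribe(audio_chunk: bytes) -> str:
    """Stub for the fine-tuned speech-recognition model."""
    return "patient unresponsive, not breathing"

def predict_protocols(transcript: str, k: int = 3) -> List[Tuple[str, float]]:
    """Stub for EMS protocol prediction (top-k protocols with scores)."""
    return [("cardiac arrest", 0.82), ("respiratory distress", 0.11), ("overdose", 0.04)][:k]

def recognize_action(video_frame: bytes, audio_chunk: bytes, protocol: str) -> str:
    """Stub for the multimodal action-recognition module."""
    return "chest compressions"

def assist(audio_chunk: bytes, video_frame: bytes) -> dict:
    transcript = transcribe(audio_chunk)
    protocols = predict_protocols(transcript)
    action = recognize_action(video_frame, audio_chunk, protocols[0][0])
    # In the real system this summary would be rendered on the AR smart glasses.
    return {"transcript": transcript, "top_protocols": protocols, "observed_action": action}

print(assist(b"", b""))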
He, Z.; Li, S.; Song, Y.; Cai, Z.
Towards Building Condition-Based Cross-Modality Intention-Aware Human-AI Cooperation under VR Environment Proceedings Article
In: Conf Hum Fact Comput Syst Proc, Association for Computing Machinery, 2024, ISBN: 979-840070330-0 (ISBN).
@inproceedings{he_towards_2024,
title = {Towards Building Condition-Based Cross-Modality Intention-Aware Human-AI Cooperation under VR Environment},
author = {Z. He and S. Li and Y. Song and Z. Cai},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85194829231&doi=10.1145%2f3613904.3642360&partnerID=40&md5=44d237a6e2a686af74ffb684ef887ab6},
doi = {10.1145/3613904.3642360},
isbn = {979-840070330-0 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Conf Hum Fact Comput Syst Proc},
publisher = {Association for Computing Machinery},
abstract = {To address critical challenges in effectively identifying user intent and forming relevant information presentations and recommendations in VR environments, we propose an innovative condition-based multi-modal human-AI cooperation framework. It highlights the intent tuples (intent, condition, intent prompt, action prompt) and 2-Large-Language-Models (2-LLMs) architecture. This design, utilizes “condition” as the core to describe tasks, dynamically match user interactions with intentions, and empower generations of various tailored multi-modal AI responses. The architecture of 2-LLMs separates the roles of intent detection and action generation, decreasing the prompt length and helping with generating appropriate responses. We implemented a VR-based intelligent furniture purchasing system based on the proposed framework and conducted a three-phase comparative user study. The results conclusively demonstrate the system's superiority in time efficiency and accuracy, intention conveyance improvements, effective product acquisitions, and user satisfaction and cooperation preference. Our framework provides a promising approach towards personalized and efficient user experiences in VR. © 2024 Copyright held by the owner/author(s)},
keywords = {Action Generation, Building conditions, Condition, Critical challenges, Cross modality, Human-AI Cooperation, Information presentation, Intention Detection, Language Model, Multi-modal, Purchasing, User interfaces, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
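The abstract is concrete about its data flow: intent tuples of (intent, condition, intent prompt, action prompt) shared between two LLMs, one detecting intent and one generating the action, so each prompt stays short. A minimal sketch of that split follows; the llm() stub and the example furniture tuple are assumptions for illustration.
# Sketch of the tuple-driven two-LLM split; llm() is a hypothetical stub.
from dataclasses import dataclass
from typing import List

@dataclass
class IntentTuple:
    intent: str          # e.g. "compare sofas"
    condition: str       # core task description used for matching
    intent_prompt: str   # fragment shown to the intent-detection LLM
    action_prompt: str   # fragment shown to the action-generation LLM

def llm(prompt: str) -> str:
    """Hypothetical LLM call; returns a canned answer here."""
    return "compare sofas"

def detect_intent(utterance: str, catalog: List[IntentTuple]) -> IntentTuple:
    options = "\n".join(t.intent_prompt for t in catalog)
    answer = llm(f"User said: {utterance}\nPick the matching intent:\n{options}")
    return next(t for t in catalog if t.intent in answer)

def generate_action(utterance: str, match: IntentTuple) -> str:
    # The second LLM only ever sees the matched tuple, which keeps its prompt short.
    return llm(f"{match.action_prompt}\nUser said: {utterance}")

catalog = [IntentTuple("compare sofas", "user wants to compare two products in view",
                       "intent: compare sofas",
                       "List the differences between the two sofas the user is looking at.")]
match = detect_intent("How do these two couches differ?", catalog)
print(match.intent, "->", generate_action("How do these two couches differ?", match))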
Lee, S.; Park, W.; Lee, K.
Building Knowledge Base of 3D Object Assets Using Multimodal LLM AI Model Proceedings Article
In: Int. Conf. ICT Convergence, pp. 416–418, IEEE Computer Society, 2024, ISBN: 21621233 (ISSN); 979-835036463-7 (ISBN).
@inproceedings{lee_building_2024,
title = {Building Knowledge Base of 3D Object Assets Using Multimodal LLM AI Model},
author = {S. Lee and W. Park and K. Lee},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85217636269&doi=10.1109%2fICTC62082.2024.10827434&partnerID=40&md5=581ee8ca50eb3dae15dc9675971cf428},
doi = {10.1109/ICTC62082.2024.10827434},
isbn = {21621233 (ISSN); 979-835036463-7 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. ICT Convergence},
pages = {416–418},
publisher = {IEEE Computer Society},
abstract = {The proliferation of various XR (eXtended Reality) services and the increasing incorporation of visual effects into existing content services have led to an exponential rise in the demand for 3D object assets. This paper describes an LLM (Large Language Model)-based multimodal AI model pipeline that can be applied to a generative AI model for creating new 3D objects or restructuring the asset management system to enhance the reusability of existing 3D objects. By leveraging a multimodal AI model, we derived descriptive text for assets such as 3D object, 2D image at a human-perceptible level, rather than mere data, and subsequently used an LLM to generate knowledge triplets for constructing an asset knowledge base. The applicability of this pipeline was verified using actual 3D objects from a content production company. Future work will focus on improving the quality of the generated knowledge triplets themselves by training the multimodal AI model with real-world content usage assets. © 2024 IEEE.},
keywords = {3D object, Asset management, Content services, Exponentials, Information Management, Knowledge Base, Language Model, Large language model, LLM, Multi-modal, Multi-Modal AI, Reusability, Visual effects, XR},
pubstate = {published},
tppubtype = {inproceedings}
}
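The pipeline in the abstract runs from a human-readable asset description to knowledge triplets for the knowledge base. The extraction step is sketched below; the prompt wording, the llm_complete() stub, and the "subject | predicate | object" line format are assumptions, not the authors' pipeline.
# Illustrative triplet extraction; llm_complete() is a hypothetical stub.
from typing import List, Tuple

Triplet = Tuple[str, str, str]

def llm_complete(prompt: str) -> str:
    """Hypothetical LLM call; returns a canned response here."""
    return ("office_chair_01 | is_a | chair\n"
            "office_chair_01 | has_material | mesh fabric\n"
            "office_chair_01 | located_in | meeting_room")

def extract_triplets(asset_id: str, description: str) -> List[Triplet]:
    prompt = (f"Asset {asset_id}: {description}\n"
              "Return knowledge triplets, one per line, formatted as: subject | predicate | object")
    triplets: List[Triplet] = []
    for line in llm_complete(prompt).splitlines():
        parts = [p.strip() for p in line.split("|")]
        if len(parts) == 3:
            triplets.append((parts[0], parts[1], parts[2]))
    return triplets

print(extract_triplets("office_chair_01",
                       "A black mesh office chair with wheels, placed in the meeting room."))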
Xie, W.; Liu, Y.; Wang, K.; Wang, M.
LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach Journal Article
In: IEEE Signal Processing Letters, vol. 31, pp. 2250–2254, 2024, ISSN: 10709908 (ISSN).
@article{xie_llm-guided_2024,
title = {LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach},
author = {W. Xie and Y. Liu and K. Wang and M. Wang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85203417746&doi=10.1109%2fLSP.2024.3452556&partnerID=40&md5=88460ec3043fa9161c4d5dd6fc282f95},
doi = {10.1109/LSP.2024.3452556},
issn = {10709908 (ISSN)},
year = {2024},
date = {2024-01-01},
journal = {IEEE Signal Processing Letters},
volume = {31},
pages = {2250–2254},
abstract = {This paper addresses the critical need for accurate and reliable point cloud quality assessment (PCQA) in various applications, such as autonomous driving, robotics, virtual reality, and 3D reconstruction. To meet this need, we propose a large language model (LLM)-guided PCQA approach based on graph learning. Specifically, we first utilize the LLM to generate quality description texts for each 3D object, and employ two CLIP-like feature encoders to represent the image and text modalities. Next, we design a latent feature enhancer module to improve contrastive learning, enabling more effective alignment performance. Finally, we develop a graph network fusion module that utilizes a ranking-based loss to adjust the relationship of different nodes, which explicitly considers both modality fusion and quality ranking. Experimental results on three benchmark datasets demonstrate the effectiveness and superiority of our approach over 12 representative PCQA methods, which demonstrate the potential of multi-modal learning, the importance of latent feature enhancement, and the significance of graph-based fusion in advancing the field of PCQA. © 2024 IEEE.},
keywords = {3D reconstruction, Cross-modal, Language Model, Large language model, Learning approach, Multi-modal, Multimodal quality assessment, Point cloud quality assessment, Point-clouds, Quality assessment},
pubstate = {published},
tppubtype = {article}
}
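Among the components listed in the abstract, the ranking-based loss is the most self-contained; a standard pairwise margin ranking loss over predicted quality scores is sketched below. This is the generic formulation such losses build on, not the paper's exact loss, and the margin value is an assumption.
# Generic pairwise margin ranking loss over predicted quality scores; illustrative only.
import torch
import torch.nn.functional as F

def pairwise_ranking_loss(pred: torch.Tensor, mos: torch.Tensor, margin: float = 0.1) -> torch.Tensor:
    """pred, mos: (N,) predicted scores and ground-truth mean opinion scores."""
    diff_pred = pred.unsqueeze(0) - pred.unsqueeze(1)       # diff_pred[i, j] = pred[j] - pred[i]
    sign = torch.sign(mos.unsqueeze(0) - mos.unsqueeze(1))  # which item should rank higher
    hinge = F.relu(margin - sign * diff_pred)               # penalize disagreeing orderings
    mask = sign != 0                                        # ignore ties and the diagonal
    return hinge[mask].mean()

# Deliberately mis-ordered predictions give a nonzero, differentiable loss.
pred = torch.tensor([0.6, 0.4, 0.5], requires_grad=True)
mos = torch.tensor([1.0, 4.5, 3.0])
loss = pairwise_ranking_loss(pred, mos)
loss.backward()
print(float(loss))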
Villalobos, W.; Kumar, Y.; Li, J. J.
The Multilingual Eyes Multimodal Traveler’s App Proceedings Article
In: Yang, X.-S.; Sherratt, S.; Dey, N.; Joshi, A. (Ed.): Lect. Notes Networks Syst., pp. 565–575, Springer Science and Business Media Deutschland GmbH, 2024, ISBN: 23673370 (ISSN); 978-981973304-0 (ISBN).
@inproceedings{villalobos_multilingual_2024,
title = {The Multilingual Eyes Multimodal Traveler’s App},
author = {W. Villalobos and Y. Kumar and J. J. Li},
editor = {Yang X.-S. and Sherratt S. and Dey N. and Joshi A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85201104509&doi=10.1007%2f978-981-97-3305-7_45&partnerID=40&md5=91f94aa091c97ec3ad251e07b47fa06e},
doi = {10.1007/978-981-97-3305-7_45},
isbn = {23673370 (ISSN); 978-981973304-0 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Networks Syst.},
volume = {1004 LNNS},
pages = {565–575},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {This paper presents an in-depth analysis of “The Multilingual Eyes Multimodal Traveler’s App” (MEMTA), a novel application in the realm of travel technology, leveraging advanced Artificial Intelligence (AI) capabilities. The core of MEMTA’s innovation lies in its integration of multimodal Large Language Models (LLMs), notably ChatGPT-4-Vision, to enhance navigational assistance and situational awareness for tourists and visually impaired individuals in diverse environments. The study rigorously evaluates how the incorporation of OpenAI’s Whisper and DALL-E 3 technologies augments the app’s proficiency in real-time, multilingual translation, pronunciation, and visual content generation, thereby significantly improving the user experience in various geographical settings. A key focus is placed on the development and impact of a custom GPT model, Susanin, designed specifically for the app, highlighting its advancements in Human-AI interaction and accessibility over standard LLMs. The paper thoroughly explores the practical applications of MEMTA, extending its utility beyond mere travel assistance to sectors such as robotics, virtual reality, and military operations, thus underscoring its multifaceted significance. Through this exploration, the study contributes novel insights into the fields of AI-enhanced travel, assistive technologies, and the broader scope of human-AI interaction. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2024.},
keywords = {AI in travel, Artificial intelligence in travel, Assistive navigation technologies, Assistive navigation technology, Assistive navigations, Human-AI interaction in tourism, Human-artificial intelligence interaction in tourism, Language Model, Military applications, Military operations, Multi-modal, Multilingual translations, Multimodal large language model, Multimodal LLMs, Navigation technology, Real- time, Real-time multilingual translation, Robots, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Manesh, S. A.; Zhang, T.; Onishi, Y.; Hara, K.; Bateman, S.; Li, J.; Tang, A.
How People Prompt Generative AI to Create Interactive VR Scenes Proceedings Article
In: Vallgarda, A.; Jonsson, L.; Fritsch, J.; Alaoui, S. F.; Le Dantec, C. A. (Ed.): Proc. ACM Des. Interact. Syst. Conf., pp. 2319–2340, Association for Computing Machinery, Inc, 2024, ISBN: 979-840070583-0 (ISBN).
@inproceedings{manesh_how_2024,
title = {How People Prompt Generative AI to Create Interactive VR Scenes},
author = {S. A. Manesh and T. Zhang and Y. Onishi and K. Hara and S. Bateman and J. Li and A. Tang},
editor = {Vallgarda A. and Jonsson L. and Fritsch J. and Alaoui S.F. and Le Dantec C.A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200348302&doi=10.1145%2f3643834.3661547&partnerID=40&md5=11831bb65214fd75905ccdaeb8356cdf},
doi = {10.1145/3643834.3661547},
isbn = {979-840070583-0 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. ACM Des. Interact. Syst. Conf.},
pages = {2319–2340},
publisher = {Association for Computing Machinery, Inc},
abstract = {Generative AI tools can provide people with the ability to create virtual environments and scenes with natural language prompts. Yet, how people will formulate such prompts is unclear—particularly when they inhabit the environment that they are designing. For instance, it is likely that a person might say, “Put a chair here,” while pointing at a location. If such linguistic and embodied features are common to people’s prompts, we need to tune models to accommodate them. In this work, we present a Wizard of Oz elicitation study with 22 participants, where we studied people’s implicit expectations when verbally prompting such programming agents to create interactive VR scenes. Our findings show when people prompted the agent, they had several implicit expectations of these agents: (1) they should have an embodied knowledge of the environment; (2) they should understand embodied prompts by users; (3) they should recall previous states of the scene and the conversation, and that (4) they should have a commonsense understanding of objects in the scene. Further, we found that participants prompted differently when they were prompting in situ (i.e. within the VR environment) versus ex situ (i.e. viewing the VR environment from the outside). To explore how these lessons could be applied, we designed and built Ostaad, a conversational programming agent that allows non-programmers to design interactive VR experiences that they inhabit. Based on these explorations, we outline new opportunities and challenges for conversational programming agents that create VR environments. © 2024 Copyright held by the owner/author(s).},
keywords = {Embodied interaction, Embodied knowledge, Embodied prompting, Generative AI, Interactive virtual reality, Multi-modal, Natural languages, Programming agents, Prompting, User interfaces, Virtual Reality, Wizard of Oz},
pubstate = {published},
tppubtype = {inproceedings}
}
Behravan, M.; Gracanin, D.
Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality Proceedings Article
In: Spencer, S. N. (Ed.): Proc. ACM Symp. Virtual Reality Softw. Technol. VRST, Association for Computing Machinery, 2024, ISBN: 979-840070535-9 (ISBN).
@inproceedings{behravan_generative_2024,
title = {Generative Multi-Modal Artificial Intelligence for Dynamic Real-Time Context-Aware Content Creation in Augmented Reality},
author = {M. Behravan and D. Gracanin},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85212524068&doi=10.1145%2f3641825.3689685&partnerID=40&md5=daf8aa8960d9dd4dbdbf67ccb1e7fb83},
doi = {10.1145/3641825.3689685},
isbn = {979-840070535-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. ACM Symp. Virtual Reality Softw. Technol. VRST},
publisher = {Association for Computing Machinery},
abstract = {We introduce a framework that uses generative Artificial Intelligence (AI) for dynamic and context-aware content creation in Augmented Reality (AR). By integrating Vision Language Models (VLMs), our system detects and understands the physical space around the user, recommending contextually relevant objects. These objects are transformed into 3D models using a text-to-3D generative AI techniques, allowing for real-time content inclusion within the AR space. This approach enhances user experience by enabling intuitive customization through spoken commands, while reducing costs and improving accessibility to advanced AR interactions. The framework's vision and language capabilities support the generation of comprehensive and context-specific 3D objects. © 2024 Owner/Author.},
keywords = {3D object, 3D Object Generation, Augmented Reality, Content creation, Context-Aware, Generative adversarial networks, Generative AI, generative artificial intelligence, Language Model, Multi-modal, Real- time, Time contexts, Vision language model, vision language models, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Xu, S.; Wei, Y.; Zheng, P.; Zhang, J.; Yu, C.
LLM enabled generative collaborative design in a mixed reality environment Journal Article
In: Journal of Manufacturing Systems, vol. 74, pp. 703–715, 2024, ISSN: 02786125 (ISSN).
@article{xu_llm_2024,
title = {LLM enabled generative collaborative design in a mixed reality environment},
author = {S. Xu and Y. Wei and P. Zheng and J. Zhang and C. Yu},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85192244873&doi=10.1016%2fj.jmsy.2024.04.030&partnerID=40&md5=3f050c429cf5a4120d10a432311f46cb},
doi = {10.1016/j.jmsy.2024.04.030},
issn = {02786125 (ISSN)},
year = {2024},
date = {2024-01-01},
journal = {Journal of Manufacturing Systems},
volume = {74},
pages = {703–715},
abstract = {In the collaborative design process, diverse stakeholder backgrounds often introduce inefficiencies in collaboration, such as delays in design delivery and decreased creativity, primarily due to misunderstandings and communication barriers caused by this diversity. To respond, this study proposes an AI-augmented Multimodal Collaborative Design (AI-MCD) framework. This framework utilizes Large Language Models (LLM) to establish an iterative prompting mechanism that provides professional design prompts for Generative AI (GAI) to generate precise visual schemes. On this basis, the GAI cooperates with Mixed Reality (MR) technology to form an interactive and immersive environment for enabling full participation in the design process. By integrating these technologies, the study aims to help stakeholders form a unified cognition and optimize the traditional collaborative design process. Through a case study involving the development of heart education products for children, the effectiveness of the framework is emphasized, and the practical application and effectiveness of the proposed method innovation are demonstrated. © 2024 The Society of Manufacturing Engineers},
keywords = {Collaborative design, Collaborative design process, Communication barriers, Computational Linguistics, design, Design frameworks, generative artificial intelligence, Iterative methods, Language Model, Large language model, Mixed reality, Mixed-reality environment, Multi-modal, Visual languages},
pubstate = {published},
tppubtype = {article}
}
de Oliveira, E. A. Masasi; Silva, D. F. C.; Filho, A. R. G.
Improving VR Accessibility Through Automatic 360 Scene Description Using Multimodal Large Language Models Proceedings Article
In: ACM Int. Conf. Proc. Ser., pp. 289–293, Association for Computing Machinery, 2024, ISBN: 979-840070979-1 (ISBN).
@inproceedings{masasi_de_oliveira_improving_2024,
title = {Improving VR Accessibility Through Automatic 360 Scene Description Using Multimodal Large Language Models},
author = {E. A. Masasi de Oliveira and D. F. C. Silva and A. R. G. Filho},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85206580797&doi=10.1145%2f3691573.3691619&partnerID=40&md5=6e80800fce0e6b56679fbcbe982bcfa7},
doi = {10.1145/3691573.3691619},
isbn = {979-840070979-1 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
pages = {289–293},
publisher = {Association for Computing Machinery},
abstract = {Advancements in Virtual Reality (VR) technology hold immense promise for enriching immersive experiences. Despite the advancements in VR technology, there remains a significant gap in addressing accessibility concerns, particularly in automatically providing descriptive information for VR scenes. This paper combines the potential of leveraging Multimodal Large Language Models (MLLMs) to automatically generate text descriptions for 360 VR scenes according to Speech-to-Text (STT) prompts. As a case study, we conduct experiments on educational settings in VR museums, improving dynamic experiences across various contexts. Despite minor challenges in adapting MLLMs to VR Scenes, the experiments demonstrate that they can generate descriptions with high quality. Our findings provide insights for enhancing VR experiences and ensuring accessibility to individuals with disabilities or diverse needs. © 2024 Copyright held by the owner/author(s).},
keywords = {3D Scene, 3D scenes, Accessibility, Computer simulation languages, Descriptive information, Digital elevation model, Immersive, Language Model, Multi-modal, Multimodal large language model, Multimodal Large Language Models (MLLMs), Scene description, Virtual environments, Virtual Reality, Virtual Reality (VR), Virtual reality technology},
pubstate = {published},
tppubtype = {inproceedings}
}
Leong, C. W.; Jawahar, N.; Basheerabad, V.; Wörtwein, T.; Emerson, A.; Sivan, G.
Combining Generative and Discriminative AI for High-Stakes Interview Practice Proceedings Article
In: ACM Int. Conf. Proc. Ser., pp. 94–96, Association for Computing Machinery, 2024, ISBN: 979-840070463-5 (ISBN).
@inproceedings{leong_combining_2024,
title = {Combining Generative and Discriminative AI for High-Stakes Interview Practice},
author = {C. W. Leong and N. Jawahar and V. Basheerabad and T. Wörtwein and A. Emerson and G. Sivan},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85211135262&doi=10.1145%2f3686215.3688377&partnerID=40&md5=4f53f4466d43840510a36c125eeefa16},
doi = {10.1145/3686215.3688377},
isbn = {979-840070463-5 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
pages = {94–96},
publisher = {Association for Computing Machinery},
abstract = {We present a demo comprising an end-to-end AI pipeline for practicing video interviews for a high-stakes scenarios (i.e., college admissions) with personalized, actionable feedback for continuous improvement of the user. This system provides personalized, actionable feedback for continuous user improvement. Utilizing large language models (LLMs), we generate questions and responses for a virtual avatar interviewer. Our focus on key qualities - such as concise responses with low latency, empathy, and smooth topic navigation - led to a comparative evaluation of several prominent LLMs, each undergoing evolutionary development. We also discuss the integration of avatar technology to create an immersive, virtual environment for naturalistic dyadic conversations. © 2024 Owner/Author.},
keywords = {AI systems, College admissions, Continuous improvements, End to end, Interactive computer graphics, Interactive dialog system, interactive dialogue systems, Language Model, Modeling languages, Multi-modal, Multimodal computing, Video interview, video interviews, Virtual avatar, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
2023
Lee, S.; Lee, H.; Lee, K.
Knowledge Generation Pipeline using LLM for Building 3D Object Knowledge Base Proceedings Article
In: Int. Conf. ICT Convergence, pp. 1303–1305, IEEE Computer Society, 2023, ISBN: 21621233 (ISSN); 979-835031327-7 (ISBN).
@inproceedings{lee_knowledge_2023,
title = {Knowledge Generation Pipeline using LLM for Building 3D Object Knowledge Base},
author = {S. Lee and H. Lee and K. Lee},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184593202&doi=10.1109%2fICTC58733.2023.10392933&partnerID=40&md5=b877638607a04e5a31a2d5723af6e11b},
doi = {10.1109/ICTC58733.2023.10392933},
isbn = {21621233 (ISSN); 979-835031327-7 (ISBN)},
year = {2023},
date = {2023-01-01},
booktitle = {Int. Conf. ICT Convergence},
pages = {1303–1305},
publisher = {IEEE Computer Society},
abstract = {With the wide spread of XR(eXtended Reality) contents such as Metaverse and VR(Virtual Reality) / AR(Augmented Reality), the utilization and importance of 3D objects are increasing. In this paper, we describe a knowledge generation pipeline of 3D object for reuse of existing 3D objects and production of new 3D object using generative AI(Artificial Intelligence). 3D object knowledge includes not only the object itself data that are generated in object editing phase but the information for human to recognize and understand objects. The target 3D model for building knowledge is the space model of office for business Metaverse service and the model of objects composing the space. LLM(Large Language Model)-based multimodal AI was used to extract knowledge from 3D model in a systematic and automated way. We plan to expand the pipeline to utilize knowledge base for managing extracted knowledge and correcting errors occurred during the LLM process for the knowledge extraction. © 2023 IEEE.},
keywords = {3D modeling, 3D models, 3D object, 3d-modeling, Augmented Reality, Data Mining, Knowledge Base, Knowledge based systems, Knowledge generations, Language Model, Metaverse, Metaverses, Multi-modal, MultiModal AI, Multimodal artificial intelligence, Pipelines, Virtual Reality, XR},
pubstate = {published},
tppubtype = {inproceedings}
}
2022
Wang, A.; Gao, Z.; Lee, L. H.; Braud, T.; Hui, P.
Decentralized, not Dehumanized in the Metaverse: Bringing Utility to NFTs through Multimodal Interaction Proceedings Article
In: ACM Int. Conf. Proc. Ser., pp. 662–667, Association for Computing Machinery, 2022, ISBN: 978-145039390-4 (ISBN).
@inproceedings{wang_decentralized_2022,
title = {Decentralized, not Dehumanized in the Metaverse: Bringing Utility to NFTs through Multimodal Interaction},
author = {A. Wang and Z. Gao and L. H. Lee and T. Braud and P. Hui},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85142799074&doi=10.1145%2f3536221.3558176&partnerID=40&md5=f9dee1e9e60afc71c4533cbdee0b98a7},
doi = {10.1145/3536221.3558176},
isbn = {978-145039390-4 (ISBN)},
year = {2022},
date = {2022-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
pages = {662–667},
publisher = {Association for Computing Machinery},
abstract = {User Interaction for NFTs (Non-fungible Tokens) is gaining increasing attention. Although NFTs have been traditionally single-use and monolithic, recent applications aim to connect multimodal interaction with human behavior. This paper reviews the related technological approaches and business practices in NFT art. We highlight that multimodal interaction is a currently under-studied issue in mainstream NFT art, and conjecture that multimodal interaction is a crucial enabler for decentralization in the NFT community. We present a continuum theory and propose a framework combining a bottom-up approach with AI multimodal process. Through this framework, we put forward integrating human behavior data into generative NFT units, as "multimodal interactive NFT." Our work displays the possibilities of NFTs in the art world, beyond the traditional 2D and 3D static content. © 2022 ACM.},
keywords = {AI-generated art, Arts computing, Behavioral Research, Computation theory, Continuum mechanics, Decentralised, Human behaviors, Interaction, Multi-modal, multimodal, Multimodal Interaction, NFTs, Non-fungible token, Text-to-image, The metaverse},
pubstate = {published},
tppubtype = {inproceedings}
}