AHCI RESEARCH GROUP
Publications
Papers published in international journals,
proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTeX record for each paper.
2025
Chen, J.; Wu, X.; Lan, T.; Li, B.
LLMER: Crafting Interactive Extended Reality Worlds with JSON Data Generated by Large Language Models Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2715–2724, 2025, ISSN: 10772626 (ISSN).
Abstract | Links | BibTeX | Tags: % reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality
@article{chen_llmer_2025,
  title     = {{LLMER}: Crafting Interactive Extended Reality Worlds with {JSON} Data Generated by Large Language Models},
  author    = {Chen, J. and Wu, X. and Lan, T. and Li, B.},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003825793&doi=10.1109%2fTVCG.2025.3549549&partnerID=40&md5=da4681d0714548e3a7e0c8c3295d2348},
  doi       = {10.1109/TVCG.2025.3549549},
  issn      = {1077-2626},
  year      = {2025},
  date      = {2025-01-01},
  journal   = {IEEE Transactions on Visualization and Computer Graphics},
  volume    = {31},
  number    = {5},
  pages     = {2715--2724},
  abstract  = {The integration of Large Language Models (LLMs) like GPT-4 with Extended Reality (XR) technologies offers the potential to build truly immersive XR environments that interact with human users through natural language, e.g., generating and animating 3D scenes from audio inputs. However, the complexity of XR environments makes it difficult to accurately extract relevant contextual data and scene/object parameters from an overwhelming volume of XR artifacts. It leads to not only increased costs with pay-per-use models, but also elevated levels of generation errors. Moreover, existing approaches focusing on coding script generation are often prone to generation errors, resulting in flawed or invalid scripts, application crashes, and ultimately a degraded user experience. To overcome these challenges, we introduce LLMER, a novel framework that creates interactive XR worlds using JSON data generated by LLMs. Unlike prior approaches focusing on coding script generation, LLMER translates natural language inputs into JSON data, significantly reducing the likelihood of application crashes and processing latency. It employs a multi-stage strategy to supply only the essential contextual information adapted to the user's request and features multiple modules designed for various XR tasks. Our preliminary user study reveals the effectiveness of the proposed system, with over 80\% reduction in consumed tokens and around 60\% reduction in task completion time compared to state-of-the-art approaches. The analysis of users' feedback also illuminates a series of directions for further optimization. © 1995-2012 IEEE.},
  keywords  = {\% reductions, 3D modeling, algorithm, Algorithms, Augmented Reality, Coding errors, Computer graphics, Computer interaction, computer interface, Computer simulation languages, Extended reality, generative artificial intelligence, human, Human users, human-computer interaction, Humans, Imaging, Immersive, Language, Language Model, Large language model, large language models, Metadata, Natural Language Processing, Natural language processing systems, Natural languages, procedures, Script generation, Spatio-temporal data, Three dimensional computer graphics, Three-Dimensional, three-dimensional imaging, User-Computer Interface, Virtual Reality},
  pubstate  = {published},
  tppubtype = {article}
}
Kim, Y.; Aamir, Z.; Singh, M.; Boorboor, S.; Mueller, K.; Kaufman, A. E.
Explainable XR: Understanding User Behaviors of XR Environments Using LLM-Assisted Analytics Framework Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2756–2766, 2025, ISSN: 10772626 (ISSN).
Abstract | Links | BibTeX | Tags: adult, Agnostic, Article, Assistive, Cross Reality, Data Analytics, Data collection, data interpretation, Data recording, Data visualization, Extended reality, human, Language Model, Large language model, large language models, Multi-modal, Multimodal Data Collection, normal human, Personalized assistive technique, Personalized Assistive Techniques, recorder, Spatio-temporal data, therapy, user behavior, User behaviors, Virtual addresses, Virtual environments, Virtual Reality, Visual analytics, Visual languages
@article{kim_explainable_2025,
  title     = {Explainable {XR}: Understanding User Behaviors of {XR} Environments Using {LLM}-Assisted Analytics Framework},
  author    = {Kim, Y. and Aamir, Z. and Singh, M. and Boorboor, S. and Mueller, K. and Kaufman, A. E.},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003815583&doi=10.1109%2fTVCG.2025.3549537&partnerID=40&md5=1085b698db06656985f80418cb37b773},
  doi       = {10.1109/TVCG.2025.3549537},
  issn      = {1077-2626},
  year      = {2025},
  date      = {2025-01-01},
  journal   = {IEEE Transactions on Visualization and Computer Graphics},
  volume    = {31},
  number    = {5},
  pages     = {2756--2766},
  abstract  = {We present Explainable XR, an end-to-end framework for analyzing user behavior in diverse eXtended Reality (XR) environments by leveraging Large Language Models (LLMs) for data interpretation assistance. Existing XR user analytics frameworks face challenges in handling cross-virtuality - AR, VR, MR - transitions, multi-user collaborative application scenarios, and the complexity of multimodal data. Explainable XR addresses these challenges by providing a virtuality-agnostic solution for the collection, analysis, and visualization of immersive sessions. We propose three main components in our framework: (1) A novel user data recording schema, called User Action Descriptor (UAD), that can capture the users' multimodal actions, along with their intents and the contexts; (2) a platform-agnostic XR session recorder, and (3) a visual analytics interface that offers LLM-assisted insights tailored to the analysts' perspectives, facilitating the exploration and analysis of the recorded XR session data. We demonstrate the versatility of Explainable XR by demonstrating five use-case scenarios, in both individual and collaborative XR applications across virtualities. Our technical evaluation and user studies show that Explainable XR provides a highly usable analytics solution for understanding user actions and delivering multifaceted, actionable insights into user behaviors in immersive environments. © 1995-2012 IEEE.},
  keywords  = {adult, Agnostic, Article, Assistive, Cross Reality, Data Analytics, Data collection, data interpretation, Data recording, Data visualization, Extended reality, human, Language Model, Large language model, large language models, Multi-modal, Multimodal Data Collection, normal human, Personalized assistive technique, Personalized Assistive Techniques, recorder, Spatio-temporal data, therapy, user behavior, User behaviors, Virtual addresses, Virtual environments, Virtual Reality, Visual analytics, Visual languages},
  pubstate  = {published},
  tppubtype = {article}
}
2024
Ding, P.; Liu, J.; Sun, M.; Li, L.; Liu, H.
Enhancing Computational Processing Performance for Generative AI Large Models with Autonomous Decision-Making in Metaverse Applications Proceedings Article
In: Proc. - IEEE Int. Conf. Metaverse Comput., Netw., Appl., MetaCom, pp. 253–258, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833151599-7 (ISBN).
Abstract | Links | BibTeX | Tags: Adversarial machine learning, AGI (Artificial General Intelligence), Artificial general intelligence, Artificial general intelligences, Autonomous decision, Autonomous Decision-Making, Data assimilation, Data integration, Decisions makings, Digital Twin Technology, Emotion Recognition, Generative adversarial networks, Generative AI large model, Generative AI Large Models, Large models, Metaverse, Metaverses, Model Acceleration, Model Compression, Multi agent systems, Multi-agent systems, Multi-modal data, Multi-Modal Data Integration, Multiagent systems (MASs), Reinforcement Learning, Reinforcement learnings, Spatio-temporal data
@inproceedings{ding_enhancing_2024,
  title     = {Enhancing Computational Processing Performance for Generative {AI} Large Models with Autonomous Decision-Making in {Metaverse} Applications},
  author    = {Ding, P. and Liu, J. and Sun, M. and Li, L. and Liu, H.},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85211489063&doi=10.1109%2fMetaCom62920.2024.00048&partnerID=40&md5=ae085a7d90b12c9090f5bf7a274bc7ce},
  doi       = {10.1109/MetaCom62920.2024.00048},
  isbn      = {979-8-3315-1599-7},
  year      = {2024},
  date      = {2024-01-01},
  booktitle = {Proc. - IEEE Int. Conf. Metaverse Comput., Netw., Appl., MetaCom},
  pages     = {253--258},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  abstract  = {We explore how to enhance the computational processing performance for generative AI large models with autonomous decision-making in metaverse applications. We first introduce the relationship between AI large models and the Metaverse. We elaborate on the application scenarios of generative AI large models in Metaverse, including real-time weather simulation, embodied intelligence of agents, dynamic environment interaction, and user emotion recognition. We then propose the method of Multi-Dimensional Optimization Generation Framework (MDOGF) to improve computational processing performance. The experiment results show great improvement in computational processing performance. © 2024 IEEE.},
  keywords  = {Adversarial machine learning, AGI (Artificial General Intelligence), Artificial general intelligence, Artificial general intelligences, Autonomous decision, Autonomous Decision-Making, Data assimilation, Data integration, Decisions makings, Digital Twin Technology, Emotion Recognition, Generative adversarial networks, Generative AI large model, Generative AI Large Models, Large models, Metaverse, Metaverses, Model Acceleration, Model Compression, Multi agent systems, Multi-agent systems, Multi-modal data, Multi-Modal Data Integration, Multiagent systems (MASs), Reinforcement Learning, Reinforcement learnings, Spatio-temporal data},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2023
Feng, Y.; Zhu, H.; Peng, D.; Peng, X.; Hu, P.
RONO: Robust Discriminative Learning with Noisy Labels for 2D-3D Cross-Modal Retrieval Proceedings Article
In: Proc IEEE Comput Soc Conf Comput Vision Pattern Recognit, pp. 11610–11619, IEEE Computer Society, 2023, ISBN: 10636919 (ISSN).
Abstract | Links | BibTeX | Tags: 3D content, 3D data, 3D modeling, Adversarial machine learning, Contrastive Learning, Cross-modal, Discriminative learning, Federated learning, Heterogeneous structures, Learning mechanism, Learning performance, Metaverses, Multi-modal learning, Noisy labels, Spatio-temporal data
@inproceedings{feng_rono_2023,
  title     = {{RONO}: Robust Discriminative Learning with Noisy Labels for {2D}-{3D} Cross-Modal Retrieval},
  author    = {Feng, Y. and Zhu, H. and Peng, D. and Peng, X. and Hu, P.},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85170845124&doi=10.1109%2fCVPR52729.2023.01117&partnerID=40&md5=2eee285207ff3ea8e774480e29d96ec1},
  doi       = {10.1109/CVPR52729.2023.01117},
  issn      = {1063-6919},
  year      = {2023},
  date      = {2023-01-01},
  booktitle = {Proc IEEE Comput Soc Conf Comput Vision Pattern Recognit},
  volume    = {2023-June},
  pages     = {11610--11619},
  publisher = {IEEE Computer Society},
  abstract  = {Recently, with the advent of Metaverse and AI Generated Content, cross-modal retrieval becomes popular with a burst of 2D and 3D data. However, this problem is challenging given the heterogeneous structure and semantic discrepancies. Moreover, imperfect annotations are ubiquitous given the ambiguous 2D and 3D content, thus inevitably producing noisy labels to degrade the learning performance. To tackle the problem, this paper proposes a robust 2D-3D retrieval framework (RONO) to robustly learn from noisy multimodal data. Specifically, one novel Robust Discriminative Center Learning mechanism (RDCL) is proposed in RONO to adaptively distinguish clean and noisy samples for respectively providing them with positive and negative optimization directions, thus mitigating the negative impact of noisy labels. Besides, we present a Shared Space Consistency Learning mechanism (SSCL) to capture the intrinsic information inside the noisy data by minimizing the cross-modal and semantic discrepancy between common space and label space simultaneously. Comprehensive mathematical analyses are given to theoretically prove the noise tolerance of the proposed method. Furthermore, we conduct extensive experiments on four 3D-model multimodal datasets to verify the effectiveness of our method by comparing it with 15 state-of-the-art methods. © 2023 IEEE.},
  keywords  = {3D content, 3D data, 3D modeling, Adversarial machine learning, Contrastive Learning, Cross-modal, Discriminative learning, Federated learning, Heterogeneous structures, Learning mechanism, Learning performance, Metaverses, Multi-modal learning, Noisy labels, Spatio-temporal data},
  pubstate  = {published},
  tppubtype = {inproceedings}
}