AHCI RESEARCH GROUP
Publications
Papers published in international journals,
proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
2025
Coronado, A.; Carvalho, S. T.; Berretta, L.
See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People Proceedings Article
In: IMX - Proc. ACM Int. Conf. Interact. Media Experiences, pp. 451–457, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071391-0 (ISBN).
Abstract | Links | BibTeX | Tags: Accessibility, Behavioral Research, Blind, Blind people, Helmet mounted displays, Human engineering, Human rehabilitation equipment, Interactive computer graphics, Interactive computer systems, Language Model, LLM, Multi-modal, Rendered environment, rendered environments, Spatial cognition, Virtual Reality, Vision aids, Visual impairment, Visual languages, Visually impaired people
@inproceedings{coronado_see_2025,
title = {See Through My Eyes: Using Multimodal Large Language Model for Describing Rendered Environments to Blind People},
author = {A. Coronado and S. T. Carvalho and L. Berretta},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007991842&doi=10.1145%2f3706370.3731641&partnerID=40&md5=2f7cb1535d39d5e59b1f43f773de3272},
doi = {10.1145/3706370.3731641},
isbn = {979-840071391-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {IMX - Proc. ACM Int. Conf. Interact. Media Experiences},
pages = {451–457},
publisher = {Association for Computing Machinery, Inc},
abstract = {Extended Reality (XR) is quickly expanding "as the next major technology wave in personal computing". Nevertheless, this expansion and adoption could also exclude certain disabled users, particularly people with visual impairment (VIP). According to the World Health Organization (WHO) in their 2019 publication, there were at least 2.2 billion people with visual impairment, a number that is also estimated to have increased in recent years. Therefore, it is important to include disabled users, especially visually impaired people, in the design of Head-Mounted Displays and Extended Reality environments. Indeed, this objective can be pursued by incorporating Multimodal Large Language Model (MLLM) technology, which can assist visually impaired people. As a case study, this study employs different prompts that result in environment descriptions from an MLLM integrated into a virtual reality (VR) escape room. Therefore, six potential prompts were engineered to generate valuable outputs for visually impaired users inside a VR environment. These outputs were evaluated using the G-Eval and VIEScore metrics. Although the results show that the prompt patterns provided a description that aligns with the user's point of view, it is highly recommended to evaluate these outputs through "expected outputs" from Orientation and Mobility Specialists and Sighted Guides. Furthermore, the subsequent step in the process is to evaluate these outputs by visually impaired people themselves to identify the most effective prompt pattern. © 2025 Copyright held by the owner/author(s).},
keywords = {Accessibility, Behavioral Research, Blind, Blind people, Helmet mounted displays, Human engineering, Human rehabilitation equipment, Interactive computer graphics, Interactive computer systems, Language Model, LLM, Multi-modal, Rendered environment, rendered environments, Spatial cognition, Virtual Reality, Vision aids, Visual impairment, Visual languages, Visually impaired people},
pubstate = {published},
tppubtype = {inproceedings}
}
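A minimal sketch of the kind of frame-plus-prompt call this pipeline relies on, assuming the OpenAI Python client, a hypothetical captured frame, and a prompt loosely in the spirit of the six patterns evaluated above (not the authors' code):

import base64
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def describe_frame(image_path: str, prompt: str) -> str:
    # Encode the rendered frame so it can be sent inline to the multimodal model.
    with open(image_path, "rb") as f:
        frame_b64 = base64.b64encode(f.read()).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",  # assumption: any multimodal chat model would do here
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{frame_b64}"}},
            ],
        }],
    )
    return response.choices[0].message.content

# Hypothetical prompt pattern for a blind player inside the VR escape room.
print(describe_frame(
    "escape_room_frame.png",
    "Describe this room for a blind player: list the landmarks clockwise from "
    "my point of view and estimate their distance in steps."))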
Lakehal, A.; Alti, A.; Annane, B.
CORES: Context-Aware Emotion-Driven Recommendation System-Based LLM to Improve Virtual Shopping Experiences Journal Article
In: Future Internet, vol. 17, no. 2, 2025, ISSN: 19995903 (ISSN).
Abstract | Links | BibTeX | Tags: Context, Context-Aware, Customisation, Decisions makings, E-commerces, e-commerce, Emotion, emotions, Language Model, Large language model, LLM, Recommendation, Virtual environments, Virtual Reality, Virtual shopping
@article{lakehal_cores_2025,
title = {CORES: Context-Aware Emotion-Driven Recommendation System-Based LLM to Improve Virtual Shopping Experiences},
author = {A. Lakehal and A. Alti and B. Annane},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85218626299&doi=10.3390%2ffi17020094&partnerID=40&md5=a0f68e273de08b2c33d03da4cb6c19bb},
doi = {10.3390/fi17020094},
issn = {19995903 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {Future Internet},
volume = {17},
number = {2},
abstract = {In today’s business landscape, artificial intelligence (AI) plays a pivotal role in shopping processes and customization. As the demand for customization grows, virtual reality (VR) emerges as an innovative solution to improve users’ perception and decision making in virtual shopping experiences (VSEs). Despite its potential, limited research has explored the integration of contextual information and emotions in VR to deliver effective product recommendations. This paper presents CORES (context-aware emotion-driven recommendation system), a novel approach designed to enrich users’ experiences and to support decision making in VR. CORES combines advanced large language models (LLMs) and embedding-based context-aware recommendation strategies to provide customized products. Therefore, emotions are collected from social platforms, and relevant contextual information is matched to enable effective recommendation. Additionally, CORES leverages transformers and retrieval-augmented generation (RAG) capabilities to explain recommended items, facilitate VR visualization, and generate insights using various prompt templates. CORES is applied to a VR shop of different items. An empirical study validates the efficiency and accuracy of this approach, achieving a significant average accuracy of 97% and an acceptable response time of 0.3267s in dynamic shopping scenarios. © 2025 by the authors.},
keywords = {Context, Context-Aware, Customisation, Decisions makings, E-commerces, e-commerce, Emotion, emotions, Language Model, Large language model, LLM, Recommendation, Virtual environments, Virtual Reality, Virtual shopping},
pubstate = {published},
tppubtype = {article}
}
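A minimal sketch of the embedding-based, context-aware retrieval step that a CORES-style recommender builds on, assuming sentence-transformers, a toy three-item catalogue, and a simple concatenation of context and emotion into one query; the LLM/RAG explanation stage described above is omitted:

from sentence_transformers import SentenceTransformer, util

# Toy catalogue; a real system would index the full virtual shop.
products = [
    "noise-cancelling headphones for focused work",
    "colourful party speaker with light effects",
    "ergonomic office chair with lumbar support",
]

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: any sentence encoder works
product_emb = model.encode(products, convert_to_tensor=True)

def recommend(context: str, emotion: str, top_k: int = 2):
    # Fuse contextual information and the detected emotion into a single query.
    query = f"context: {context}; emotion: {emotion}"
    query_emb = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, product_emb, top_k=top_k)[0]
    return [(products[h["corpus_id"]], float(h["score"])) for h in hits]

print(recommend("shopping for a home office upgrade", "stressed"))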
Xu, F.; Zhou, T.; Nguyen, T.; Bao, H.; Lin, C.; Du, J.
Integrating augmented reality and LLM for enhanced cognitive support in critical audio communications Journal Article
In: International Journal of Human Computer Studies, vol. 194, 2025, ISSN: 10715819 (ISSN).
Abstract | Links | BibTeX | Tags: Audio communications, Augmented Reality, Cognitive loads, Cognitive support, Decisions makings, Language Model, Large language model, LLM, Logic reasoning, Maintenance, Operations and maintenance, Oral communication, Situational awareness
@article{xu_integrating_2025,
title = {Integrating augmented reality and LLM for enhanced cognitive support in critical audio communications},
author = {F. Xu and T. Zhou and T. Nguyen and H. Bao and C. Lin and J. Du},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85208467299&doi=10.1016%2fj.ijhcs.2024.103402&partnerID=40&md5=153d095b837ee1666a7da0f7ed03362c},
doi = {10.1016/j.ijhcs.2024.103402},
issn = {10715819 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {International Journal of Human Computer Studies},
volume = {194},
abstract = {Operation and Maintenance (O&M) missions are often time-sensitive and accuracy-dependent, requiring rapid and precise information processing in noisy, chaotic environments where oral communication can lead to cognitive overload and impaired decision-making. Augmented Reality (AR) and Large Language Models (LLMs) offer potential for enhancing situational awareness and lowering cognitive load by integrating digital visualizations with the physical world and improving dialogue management. However, synthesizing these technologies into a real-time system that effectively aids operators remains a challenge. This study explores the integration of AR and GPT-4, an advanced LLM, in time-sensitive O&M tasks, aiming to enhance situational awareness and manage cognitive load during oral communications. A customized AR system, incorporating the Microsoft HoloLens2 for cognitive monitoring and GPT-4 for decision making assistance, was tested in a human subject experiment with 30 participants. The 2×2 factorial experiment evaluated the effects of AR and LLM assistance on task performance and cognitive load. Results demonstrated significant improvements in task accuracy and reductions in cognitive load, highlighting the effectiveness of AR and LLM integration in supporting O&M missions. These findings emphasize the need for further research to optimize operational strategies in mission critical environments. © 2024 Elsevier Ltd},
keywords = {Audio communications, Augmented Reality, Cognitive loads, Cognitive support, Decisions makings, Language Model, Large language model, LLM, Logic reasoning, Maintenance, Operations and maintenance, Oral communication, Situational awareness},
pubstate = {published},
tppubtype = {article}
}
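A minimal sketch of the dialogue-management step, assuming the OpenAI Python client and a hypothetical noisy radio transcript; the AR/HoloLens rendering and cognitive-monitoring parts of the study are out of scope here:

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

SYSTEM = ("You support a maintenance operator wearing an AR headset. "
          "From the radio transcript, extract: task, location, hazard, next step. "
          "Answer as four short lines suitable for a heads-up display.")

def summarise_for_hud(transcript: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",  # the study used GPT-4; the exact model here is an assumption
        messages=[{"role": "system", "content": SYSTEM},
                  {"role": "user", "content": transcript}],
        temperature=0,
    )
    return response.choices[0].message.content

# Hypothetical time-critical O&M exchange.
print(summarise_for_hud(
    "Control to unit two, uh, pump three on level B is overheating, shut the "
    "inlet valve first, watch for steam near the east walkway, copy?"))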
2024
Pester, A.; Tammaa, A.; Gütl, C.; Steinmaurer, A.; El-Seoud, S. A.
Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning Proceedings Article
In: IEEE Global Eng. Edu. Conf., EDUCON, IEEE Computer Society, 2024, ISSN: 21659559 (ISSN); ISBN: 979-835039402-3 (ISBN).
Abstract | Links | BibTeX | Tags: Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds
@inproceedings{pester_conversational_2024,
title = {Conversational Agents, Virtual Worlds, and Beyond: A Review of Large Language Models Enabling Immersive Learning},
author = {A. Pester and A. Tammaa and C. Gütl and A. Steinmaurer and S. A. El-Seoud},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85199068668&doi=10.1109%2fEDUCON60312.2024.10578895&partnerID=40&md5=1b904fd8a5e06d7ced42a328c028bbb7},
doi = {10.1109/EDUCON60312.2024.10578895},
issn = {21659559 (ISSN)},
isbn = {979-835039402-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Global Eng. Edu. Conf., EDUCON},
publisher = {IEEE Computer Society},
abstract = {Large Language Models represent a significant breakthrough in Natural Language Processing research and opened a wide range of application domains. This paper demonstrates the successful integration of Large Language Models into immersive learning environments. The review highlights how this emerging technology aligns with pedagogical principles, enhancing the effectiveness of current educational systems. It also reflects recent advancements in integrating Large Language Models, including fine-tuning, hallucination reduction, fact-checking, and human evaluation of generated results. © 2024 IEEE.},
keywords = {Computational Linguistics, Computer aided instruction, Conversational Agents, Education, Immersive learning, Language Model, Large language model, Learning systems, Literature reviews, LLM, Metaverse, Metaverses, Natural language processing systems, Pedagogy, Survey literature review, Virtual Reality, Virtual worlds},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, M.; M'Hiri, F.
Beyond Traditional Teaching: Large Language Models as Simulated Teaching Assistants in Computer Science Proceedings Article
In: SIGCSE - Proc. ACM Tech. Symp. Comput. Sci. Educ., pp. 743–749, Association for Computing Machinery, Inc, 2024, ISBN: 979-840070423-9 (ISBN).
Abstract | Links | BibTeX | Tags: Adaptive teaching, ChatGPT, Computational Linguistics, CS education, E-Learning, Education computing, Engineering education, GPT, Language Model, LLM, machine learning, Machine-learning, Novice programmer, novice programmers, Openai, Programming, Python, Students, Teaching, Virtual Reality
@inproceedings{liu_beyond_2024,
title = {Beyond Traditional Teaching: Large Language Models as Simulated Teaching Assistants in Computer Science},
author = {M. Liu and F. M'Hiri},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85189289344&doi=10.1145%2f3626252.3630789&partnerID=40&md5=44ec79c8f005f4551c820c61f5b5d435},
doi = {10.1145/3626252.3630789},
isbn = {979-840070423-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {SIGCSE - Proc. ACM Tech. Symp. Comput. Sci. Educ.},
volume = {1},
pages = {743–749},
publisher = {Association for Computing Machinery, Inc},
abstract = {As the prominence of Large Language Models (LLMs) grows in various sectors, their potential in education warrants exploration. In this study, we investigate the feasibility of employing GPT-3.5 from OpenAI, as an LLM teaching assistant (TA) or a virtual TA in computer science (CS) courses. The objective is to enhance the accessibility of CS education while maintaining academic integrity by refraining from providing direct solutions to current-semester assignments. Targeting Foundations of Programming (COMP202), an undergraduate course that introduces students to programming with Python, we have developed a virtual TA using the LangChain framework, known for integrating language models with diverse data sources and environments. The virtual TA assists students with their code and clarifies complex concepts. For homework questions, it is designed to guide students with hints rather than giving out direct solutions. We assessed its performance first through a qualitative evaluation, then a survey-based comparative analysis, using a mix of questions commonly asked on the COMP202 discussion board and questions created by the authors. Our preliminary results indicate that the virtual TA outperforms human TAs on clarity and engagement, matching them on accuracy when the question is non-assignment-specific, for which human TAs still proved more reliable. These findings suggest that while virtual TAs, leveraging the capabilities of LLMs, hold great promise towards making CS education experience more accessible and engaging, their optimal use necessitates human supervision. We conclude by identifying several directions that could be explored in future implementations. © 2024 ACM.},
keywords = {Adaptive teaching, ChatGPT, Computational Linguistics, CS education, E-Learning, Education computing, Engineering education, GPT, Language Model, LLM, machine learning, Machine-learning, Novice programmer, novice programmers, Openai, Programming, Python, Students, Teaching, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
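A minimal sketch of the hints-not-solutions behaviour described above, assuming the plain OpenAI Python client and a hypothetical TA policy prompt (the course deployment used the LangChain framework):

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

TA_POLICY = (
    "You are a teaching assistant for an introductory Python course. "
    "Explain concepts and point out where the student's code goes wrong, "
    "but never write the corrected solution for graded assignment questions; "
    "give a hint and a guiding question instead."
)

def answer_student(question: str) -> str:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": TA_POLICY},
                  {"role": "user", "content": question}],
    )
    return response.choices[0].message.content

print(answer_student(
    "My loop `for i in range(len(xs)): xs.remove(xs[i])` crashes with an "
    "IndexError on assignment 3. Why?"))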
Krauss, C.; Bassbouss, L.; Upravitelev, M.; An, T. -S.; Altun, D.; Reray, L.; Balitzki, E.; Tamimi, T. El; Karagülle, M.
Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse Proceedings Article
In: R.A., Sottilare; J., Schwarz (Ed.): Lect. Notes Comput. Sci., pp. 219–238, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 03029743 (ISSN); ISBN: 978-303160608-3 (ISBN).
Abstract | Links | BibTeX | Tags: 3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality
@inproceedings{krauss_opportunities_2024,
title = {Opportunities and Challenges in Developing Educational AI-Assistants for the Metaverse},
author = {C. Krauss and L. Bassbouss and M. Upravitelev and T. -S. An and D. Altun and L. Reray and E. Balitzki and T. El Tamimi and M. Karagülle},
editor = {Sottilare R.A. and Schwarz J.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85196214138&doi=10.1007%2f978-3-031-60609-0_16&partnerID=40&md5=9a66876cb30e9e5d287a86e6cfa66e05},
doi = {10.1007/978-3-031-60609-0_16},
issn = {03029743 (ISSN)},
isbn = {978-303160608-3 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {14727 LNCS},
pages = {219–238},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {The paper explores the opportunities and challenges for metaverse learning environments with AI-Assistants based on Large Language Models. A proof of concept based on popular but proprietary technologies is presented that enables a natural language exchange between the user and an AI-based medical expert in a highly immersive environment based on the Unreal Engine. The answers generated by ChatGPT are not only played back lip-synchronously, but also visualized in the VR environment using a 3D model of a skeleton. Usability and user experience play a particularly important role in the development of the highly immersive AI-Assistant. The proof of concept serves to illustrate the opportunities and challenges that lie in the merging of large language models, metaverse applications and educational ecosystems, which are self-contained research areas. Development strategies, tools and interoperability standards will be presented to facilitate future developments in this triangle of tension. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {3D modeling, AI-assistant, AI-Assistants, Computational Linguistics, Computer aided instruction, Concept-based, E-Learning, Education, Interoperability, Language Model, Large language model, large language models, Learning Environments, Learning systems, Learning Technologies, Learning technology, LLM, Metaverse, Metaverses, Natural language processing systems, Proof of concept, User interfaces, Virtual assistants, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
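A rough sketch of the glue between the chat model and the immersive front end, assuming the OpenAI Python client and a purely hypothetical HTTP endpoint on the Unreal client; text-to-speech, lip sync, and the skeleton visualisation stay on the engine side:

import requests
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
UNREAL_ENDPOINT = "http://localhost:8080/avatar/say"  # hypothetical local endpoint

def ask_medical_assistant(question: str) -> None:
    reply = client.chat.completions.create(
        model="gpt-4o-mini",  # assumption; the prototype above used ChatGPT
        messages=[{"role": "system",
                   "content": "You are a medical anatomy tutor inside a VR classroom."},
                  {"role": "user", "content": question}],
    ).choices[0].message.content
    # Hand the text to the engine, which runs TTS, lip sync and the skeleton overlay.
    requests.post(UNREAL_ENDPOINT, json={"text": reply, "highlight": "skeleton"})

ask_medical_assistant("Which bones form the elbow joint?")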
Peretti, A.; Mazzola, M.; Capra, L.; Piazzola, M.; Carlevaro, C.
Seamless Human-Robot Interaction Through a Distributed Zero-Trust Architecture and Advanced User Interfaces Proceedings Article
In: C., Secchi; L., Marconi (Ed.): Springer. Proc. Adv. Robot., pp. 92–95, Springer Nature, 2024, ISSN: 25111256 (ISSN); ISBN: 978-303176427-1 (ISBN).
Abstract | Links | BibTeX | Tags: Advanced user interfaces, Digital Twins, HRC, Human Robot Interaction, Human-Robot Collaboration, Humans-robot interactions, Industrial robots, Industry 4.0, Intelligent robots, Interaction platform, Language Model, Large language model, LLM, Problem oriented languages, Robot Operating System, Robot operating system 2, Robot-robot collaboration, ROS2, RRC, Wages, XR, ZTA
@inproceedings{peretti_seamless_2024,
title = {Seamless Human-Robot Interaction Through a Distributed Zero-Trust Architecture and Advanced User Interfaces},
author = {A. Peretti and M. Mazzola and L. Capra and M. Piazzola and C. Carlevaro},
editor = {Secchi C. and Marconi L.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85216090556&doi=10.1007%2f978-3-031-76428-8_18&partnerID=40&md5=9f58281f8a8c034fb45fed610ce64bd2},
doi = {10.1007/978-3-031-76428-8_18},
issn = {25111256 (ISSN)},
isbn = {978-303176427-1 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Springer. Proc. Adv. Robot.},
volume = {33 SPAR},
pages = {92–95},
publisher = {Springer Nature},
abstract = {The proposed work presents a novel interaction platform designed to address the shortage of skilled workers in the labor market, facilitating the seamless integration of robotics and advanced user interfaces such as eXtended Reality (XR) to optimize Human-Robot Collaboration (HRC) as well as Robot-Robot Collaboration (RRC) in an Industry 4.0 scenario. One of the most challenging situations is to optimize and simplify the collaborations of humans and robots to decrease or avoid system slowdowns, blocks, or dangerous situations for both users and robots. The advent of LLMs (Large Language Models) has been a breakthrough across the whole IT environment because they perform well in different scenarios, from human text generation to autonomous systems management. Due to their malleability, LLMs play a primary role in Human-Robot Collaboration processes. For this reason, the platform comprises three key technical components: a distributed zero-trust architecture, a virtual avatar, and digital twins of robots powered by the Robot Operating System 2 (ROS2) platform. © The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.},
keywords = {Advanced user interfaces, Digital Twins, HRC, Human Robot Interaction, Human-Robot Collaboration, Humans-robot interactions, Industrial robots, Industry 4.0, Intelligent robots, Interaction platform, Language Model, Large language model, LLM, Problem oriented languages, Robot Operating System, Robot operating system 2, Robot-robot collaboration, ROS2, RRC, Wages, XR, ZTA},
pubstate = {published},
tppubtype = {inproceedings}
}
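A minimal sketch of the ROS 2 side only, assuming rclpy and hypothetical topic names; the LLM translation of operator text into a structured command is stubbed out, and the zero-trust and digital-twin layers are not shown:

import rclpy
from rclpy.node import Node
from std_msgs.msg import String

class HrcBridge(Node):
    """Relays operator utterances to a robot command topic (LLM call stubbed)."""

    def __init__(self):
        super().__init__("hrc_bridge")
        # Topic names are hypothetical placeholders.
        self.create_subscription(String, "operator_utterance", self.on_utterance, 10)
        self.cmd_pub = self.create_publisher(String, "robot_command", 10)

    def on_utterance(self, msg: String) -> None:
        # In the platform described above an LLM would turn free text into a
        # structured command; here we just forward a stubbed plan.
        command = String()
        command.data = f"PLAN: {msg.data}"
        self.cmd_pub.publish(command)

def main():
    rclpy.init()
    rclpy.spin(HrcBridge())
    rclpy.shutdown()

if __name__ == "__main__":
    main()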
Rahmani, R.; Westin, T.; Nevelsteen, K.
Future Healthcare in Generative AI with Real Metaverse Proceedings Article
In: E.E., Shakshuki (Ed.): Procedia Comput. Sci., pp. 487–493, Elsevier B.V., 2024, ISSN: 18770509 (ISSN).
Abstract | Links | BibTeX | Tags: Adversarial machine learning, AI, Augmented Reality, Autism spectrum disorders, Contrastive Learning, Diseases, Edge Intelligence, Generative adversarial networks, Healthcare, Immersive learning, Independent living systems, Language Model, Large language model, LLM, Metaverses, Posttraumatic stress disorder, Real Metaverse, Social challenges, Virtual environments
@inproceedings{rahmani_future_2024,
title = {Future Healthcare in Generative AI with Real Metaverse},
author = {R. Rahmani and T. Westin and K. Nevelsteen},
editor = {Shakshuki E.E.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85214986921&doi=10.1016%2fj.procs.2024.11.137&partnerID=40&md5=3e25f2a1b023cd49f59a066a96bb2dd0},
doi = {10.1016/j.procs.2024.11.137},
issn = {18770509 (ISSN)},
year = {2024},
date = {2024-01-01},
booktitle = {Procedia Comput. Sci.},
volume = {251},
pages = {487–493},
publisher = {Elsevier B.V.},
abstract = {The Metaverse offers a simulated environment that could transform healthcare by providing immersive learning experiences through an Internet application and a social form that integrates a network of virtual reality environments. The Metaverse is expected to contribute to a new way of socializing, where users can enter a verse as avatars. The concept allows avatars to switch between verses seamlessly. Virtual Reality (VR) in healthcare has shown promise for social-skill training, especially for individuals with Autism Spectrum Disorder (ASD) and social challenge training of patients with Post-Traumatic Stress Disorder (PTSD) requiring adaptable support. The problem lies in the limited adaptability and functionality of existing Metaverse implementations for individuals with ASD and PTSD. While studies have explored various implementation ideas, such as VR platforms for training social skills, social challenge and context-aware Augmented Reality (AR) systems for daily activities, many lack adaptability of user input and output. A proposed solution involves a context-aware system using AI, Large Language Models (LLMs) and generative agents to support independent living for individuals with ASD and a tool to enhance emotional learning for patients with PTSD. © 2024 The Authors.},
keywords = {Adversarial machine learning, AI, Augmented Reality, Autism spectrum disorders, Contrastive Learning, Diseases, Edge Intelligence, Generative adversarial networks, Healthcare, Immersive learning, Independent living systems, Language Model, Large language model, LLM, Metaverses, Posttraumatic stress disorder, Real Metaverse, Social challenges, Virtual environments},
pubstate = {published},
tppubtype = {inproceedings}
}
Amato, N.; Carolis, B. De; Gioia, F.; Venezia, M. N.; Palestra, G.; Loglisci, C.
Can an AI-driven VTuber engage People? The KawAIi Case Study Proceedings Article
In: A., Soto; E., Zangerle (Ed.): CEUR Workshop Proc., CEUR-WS, 2024, ISSN: 16130073 (ISSN).
Abstract | Links | BibTeX | Tags: 3D Avatars, Case-studies, Conversational Agents, Facial Expressions, Language Model, Live streaming, LLM, LLMs, Real-time, Three dimensional computer graphics, Virtual agent, Virtual Reality, YouTube
@inproceedings{amato_can_2024,
title = {Can an AI-driven VTuber engage People? The KawAIi Case Study},
author = {N. Amato and B. De Carolis and F. Gioia and M. N. Venezia and G. Palestra and C. Loglisci},
editor = {Soto A. and Zangerle E.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85190754935&partnerID=40&md5=bd76d56b13e328027aa1b458849cf73f},
issn = {16130073 (ISSN)},
year = {2024},
date = {2024-01-01},
booktitle = {CEUR Workshop Proc.},
volume = {3660},
publisher = {CEUR-WS},
abstract = {Live streaming has become increasingly popular, with most streamers presenting their real-life appearance. However, Virtual YouTubers (VTubers), virtual 2D or 3D avatars that are voiced by humans, are emerging as live streamers and attracting a growing viewership. This paper presents the development of a conversational agent, named KawAIi, embodied in a 2D character that, while accurately and promptly responding to user requests, provides an entertaining experience in streaming chat platforms such as YouTube while providing adequate real-time support. The agent relies on the Vicuna 7B GPTQ 4-bit Large Language Model (LLM). In addition, KawAIi uses a BERT-based model for analyzing the sentence generated by the model in terms of conveyed emotion and shows self-emotion awareness through facial expressions. Tested with users, the system has demonstrated a good ability to handle the interaction with the user while maintaining a pleasant user experience. In particular, KawAIi has been evaluated positively in terms of engagement and competence on various topics. The results show the potential of this technology to enrich interactivity in streaming platforms and offer a promising model for future online assistance contexts. © 2024 Copyright for this paper by its authors.},
keywords = {3D Avatars, Case-studies, Conversational Agents, Facial Expressions, Language Model, Live streaming, LLM, LLMs, Real-time, Three dimensional computer graphics, Virtual agent, Virtual Reality, YouTube},
pubstate = {published},
tppubtype = {inproceedings}
}
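A minimal sketch of the sentence-level emotion step that drives the avatar's facial expression, assuming a publicly available emotion classifier and a hypothetical expression mapping (the paper only states that a BERT-based model is used):

from transformers import pipeline

# Assumption: a public emotion classifier stands in for the BERT-based model above.
emotion_clf = pipeline("text-classification",
                       model="j-hartmann/emotion-english-distilroberta-base",
                       top_k=1)

# Hypothetical mapping from detected emotion to an avatar expression preset.
EXPRESSIONS = {"joy": "smile", "sadness": "frown", "anger": "scowl",
               "fear": "wide_eyes", "surprise": "raised_brows", "neutral": "idle"}

def expression_for(reply: str) -> str:
    label = emotion_clf(reply)[0]["label"]
    return EXPRESSIONS.get(label, "idle")

print(expression_for("That question made my day, thanks for asking!"))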
Lee, S.; Park, W.; Lee, K.
Building Knowledge Base of 3D Object Assets Using Multimodal LLM AI Model Proceedings Article
In: Int. Conf. ICT Convergence, pp. 416–418, IEEE Computer Society, 2024, ISSN: 21621233 (ISSN); ISBN: 979-835036463-7 (ISBN).
Abstract | Links | BibTeX | Tags: 3D object, Asset management, Content services, Exponentials, Information Management, Knowledge Base, Language Model, Large language model, LLM, Multi-modal, Multi-Modal AI, Reusability, Visual effects, XR
@inproceedings{lee_building_2024,
title = {Building Knowledge Base of 3D Object Assets Using Multimodal LLM AI Model},
author = {S. Lee and W. Park and K. Lee},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85217636269&doi=10.1109%2fICTC62082.2024.10827434&partnerID=40&md5=581ee8ca50eb3dae15dc9675971cf428},
doi = {10.1109/ICTC62082.2024.10827434},
issn = {21621233 (ISSN)},
isbn = {979-835036463-7 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Int. Conf. ICT Convergence},
pages = {416–418},
publisher = {IEEE Computer Society},
abstract = {The proliferation of various XR (eXtended Reality) services and the increasing incorporation of visual effects into existing content services have led to an exponential rise in the demand for 3D object assets. This paper describes an LLM (Large Language Model)-based multimodal AI model pipeline that can be applied to a generative AI model for creating new 3D objects or restructuring the asset management system to enhance the reusability of existing 3D objects. By leveraging a multimodal AI model, we derived descriptive text for assets such as 3D objects and 2D images at a human-perceptible level, rather than as mere data, and subsequently used an LLM to generate knowledge triplets for constructing an asset knowledge base. The applicability of this pipeline was verified using actual 3D objects from a content production company. Future work will focus on improving the quality of the generated knowledge triplets themselves by training the multimodal AI model with real-world content usage assets. © 2024 IEEE.},
keywords = {3D object, Asset management, Content services, Exponentials, Information Management, Knowledge Base, Language Model, Large language model, LLM, Multi-modal, Multi-Modal AI, Reusability, Visual effects, XR},
pubstate = {published},
tppubtype = {inproceedings}
}
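A minimal sketch of the triplet-generation step, assuming the OpenAI Python client, a hypothetical JSON schema, and an asset description of the kind a multimodal model might produce:

import json
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def asset_triplets(asset_description: str) -> list[tuple[str, str, str]]:
    # Ask the LLM for knowledge triplets describing the asset.
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumption: any instruction-following LLM
        response_format={"type": "json_object"},
        messages=[
            {"role": "system",
             "content": 'Return JSON: {"triplets": [[subject, predicate, object], ...]}'},
            {"role": "user", "content": asset_description},
        ],
    )
    data = json.loads(response.choices[0].message.content)
    return [tuple(t) for t in data["triplets"]]

# Hypothetical description of a 3D asset.
print(asset_triplets(
    "A low-poly wooden rowing boat with two oars and weathered blue paint, "
    "suitable for lake or harbour scenes."))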
Liang, Q.; Chen, Y.; Li, W.; Lai, M.; Ni, W.; Qiu, H.
iKnowiSee: AR Glasses with Language Learning Translation System and Identity Recognition System Built Based on Large Pre-trained Models of Language and Vision and Internet of Things Technology Proceedings Article
In: L., Zhang; W., Yu; Q., Wang; Y., Laili; Y., Liu (Ed.): Commun. Comput. Info. Sci., pp. 12–24, Springer Science and Business Media Deutschland GmbH, 2024, ISSN: 18650929 (ISSN); ISBN: 978-981973947-9 (ISBN).
Abstract | Links | BibTeX | Tags: Augmented Reality, Glass, Identity recognition, Internet of Things, Internet of things technologies, IoT, Language learning, Learning systems, LLM, Object Detection, Objects detection, Open Vocabulary Object Detection, Recognition systems, Semantics, Telephone sets, Translation (languages), Translation systems, Visual languages, Wearable computers, Wearable device, Wearable devices
@inproceedings{liang_iknowisee_2024,
title = {iKnowiSee: AR Glasses with Language Learning Translation System and Identity Recognition System Built Based on Large Pre-trained Models of Language and Vision and Internet of Things Technology},
author = {Q. Liang and Y. Chen and W. Li and M. Lai and W. Ni and H. Qiu},
editor = {Zhang L. and Yu W. and Wang Q. and Laili Y. and Liu Y.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200663840&doi=10.1007%2f978-981-97-3948-6_2&partnerID=40&md5=a0324ba6108674b1d39a338574269d60},
doi = {10.1007/978-981-97-3948-6_2},
issn = {18650929 (ISSN)},
isbn = {978-981973947-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Commun. Comput. Info. Sci.},
volume = {2139 CCIS},
pages = {12–24},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {AR glasses used in daily life have made good progress and have some practical value. However, the current design concept of AR glasses is basically to simply port the content of a cell phone and act as a secondary screen for the phone. In contrast, the AR glasses we designed are based on actual situations, focus on real-world interactions, and utilize IoT technology with the aim of enabling users to fully extract and utilize the digital information in their lives. We have created two innovative features: one is a language learning translation system for users to learn foreign languages, which integrates a large language model with an open vocabulary recognition model to fully extract the visual semantic information of the scene; the other is a social conferencing system, which utilizes the IoT cloud, pipe, edge, and end development to reduce the cost of communication and improve the efficiency of exchanges in social situations. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2024.},
keywords = {Augmented Reality, Glass, Identity recognition, Internet of Things, Internet of things technologies, IoT, Language learning, Learning systems, LLM, Object Detection, Objects detection, Open Vocabulary Object Detection, Recognition systems, Semantics, Telephone sets, Translation (languages), Translation systems, Visual languages, Wearable computers, Wearable device, Wearable devices},
pubstate = {published},
tppubtype = {inproceedings}
}
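A rough sketch of wiring an open-vocabulary detector to a language-learning prompt, assuming an OWL-ViT checkpoint, hypothetical candidate labels, and a placeholder image path; the on-device IoT pipeline described above is not modelled:

from transformers import pipeline

# Assumption: an OWL-ViT checkpoint stands in for the open-vocabulary detector.
detector = pipeline("zero-shot-object-detection", model="google/owlvit-base-patch32")

def vocabulary_card(image_path: str, candidate_labels: list[str], target_lang: str) -> str:
    detections = detector(image_path, candidate_labels=candidate_labels)
    seen = sorted({d["label"] for d in detections if d["score"] > 0.3})
    # This prompt would be sent to the LLM on the glasses' companion service.
    return (f"Translate these objects the wearer is looking at into {target_lang}, "
            f"with one example sentence each: {', '.join(seen)}")

print(vocabulary_card("street_view.jpg",
                      ["bicycle", "traffic light", "bakery", "umbrella"],
                      "Spanish"))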
Bandara, E.; Foytik, P.; Shetty, S.; Hassanzadeh, A.
Generative-AI(with Custom-Trained Meta's Llama2 LLM), Blockchain, NFT, Federated Learning and PBOM Enabled Data Security Architecture for Metaverse on 5G/6G Environment Proceedings Article
In: Proc. - IEEE Int. Conf. Mob. Ad-Hoc Smart Syst., MASS, pp. 118–124, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-835036399-9 (ISBN).
Abstract | Links | BibTeX | Tags: 5G, 6G, Adversarial machine learning, Bill of materials, Block-chain, Blockchain, Curricula, Data privacy, Distance education, Federated learning, Generative adversarial networks, Generative-AI, Hardware security, Llama2, LLM, Medium access control, Metaverse, Metaverses, Network Security, Nft, Non-fungible token, Personnel training, Problem oriented languages, Reference architecture, Steganography
@inproceedings{bandara_generative-aicustom-trained_2024,
title = {Generative-AI(with Custom-Trained Meta's Llama2 LLM), Blockchain, NFT, Federated Learning and PBOM Enabled Data Security Architecture for Metaverse on 5G/6G Environment},
author = {E. Bandara and P. Foytik and S. Shetty and A. Hassanzadeh},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85210243120&doi=10.1109%2fMASS62177.2024.00026&partnerID=40&md5=70d21ac1e9c7b886da14825376919cac},
doi = {10.1109/MASS62177.2024.00026},
isbn = {979-835036399-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Conf. Mob. Ad-Hoc Smart Syst., MASS},
pages = {118–124},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {The Metaverse is an integrated network of 3D virtual worlds accessible through a virtual reality headset. Its impact on data privacy and security is increasingly recognized as a major concern. There is a growing interest in developing a reference architecture that describes the four core aspects of its data: acquisition, storage, sharing, and interoperability. Establishing a secure data architecture is imperative to manage users' personal data and facilitate trusted AR/VR and AI/ML solutions within the Metaverse. This paper details a reference architecture empowered by Generative-AI, Blockchain, Federated Learning, and Non-Fungible Tokens (NFTs). Within this architecture, various resource providers collaborate via the blockchain network. Handling personal user data and resource provider identities is executed through a Self-Sovereign Identity-enabled privacy-preserving framework. AR/NR devices in the Metaverse are represented as NFT tokens available for user purchase. Software updates and supply-chain verification for these devices are managed using a Software Bill of Materials (SBOM) and a Pipeline Bill of Materials (PBOM) verification system. Moreover, a custom-trained Llama2 LLM from Meta has been integrated to generate PBOMs for AR/NR devices' software updates, thereby preventing malware intrusions and data breaches. This Llama2-13B LLM has been quantized and fine-tuned using Qlora to ensure optimal performance on consumer-grade hardware. The provenance of AI/ML models used in the Metaverse is encapsulated as Model Card objects, allowing external parties to audit and verify them, thus mitigating adversarial learning attacks within these models. To the best of our knowledge, this is the very first research effort aimed at standardizing PBOM schemas and integrating Language Model algorithms for the generation of PBOMs. Additionally, a proposed mechanism facilitates different AI/ML providers in training their machine learning models using a privacy-preserving federated learning approach. Authorization of communications among AR/VR devices in the Metaverse is conducted through a Zero-Trust security-enabled rule engine. A system testbed has been implemented within a 5G environment, utilizing Ericsson new Radio with Open5GS 5G core. © 2024 IEEE.},
keywords = {5G, 6G, Adversarial machine learning, Bill of materials, Block-chain, Blockchain, Curricula, Data privacy, Distance education, Federated learning, Generative adversarial networks, Generative-AI, Hardware security, Llama2, LLM, Medium access control, Metaverse, Metaverses, Network Security, Nft, Non-fungible token, Personnel training, Problem oriented languages, Reference architecture, Steganography},
pubstate = {published},
tppubtype = {inproceedings}
}
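A minimal sketch of the 4-bit quantisation plus LoRA setup that QLoRA fine-tuning typically refers to, assuming the Hugging Face transformers and peft libraries; the base checkpoint, rank, and target modules are assumptions, and the training loop and data are omitted:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit NF4 quantisation so a 13B model fits on consumer-grade hardware.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",  # assumption: the paper's custom checkpoint is not public
    quantization_config=bnb_config,
    device_map="auto",
)

# Low-rank adapters on the attention projections; rank and alpha are assumptions.
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                         target_modules=["q_proj", "v_proj"],
                         task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()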
Klein, A.; Arnowitz, E.
AI in mixed reality - Copilot on HoloLens: Spatial computing with large language models Proceedings Article
In: S.N., Spencer (Ed.): Proc. - SIGGRAPH Real-Time Live!, Association for Computing Machinery, Inc, 2024, ISBN: 979-840070526-7 (ISBN).
Abstract | Links | BibTeX | Tags: 3D, AI, AR, Gesture, Gestures, HoloLens, Language Model, LLM, Mixed reality, Real-time, Spatial computing, User experience design, User interfaces, Voice
@inproceedings{klein_ai_2024,
title = {AI in mixed reality - Copilot on HoloLens: Spatial computing with large language models},
author = {A. Klein and E. Arnowitz},
editor = {Spencer S.N.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85200657459&doi=10.1145%2f3641520.3665305&partnerID=40&md5=07d385771b8813c1fafa0efb7ae7e9f2},
doi = {10.1145/3641520.3665305},
isbn = {979-840070526-7 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - SIGGRAPH Real-Time Live!},
publisher = {Association for Computing Machinery, Inc},
abstract = {Mixed reality together with AI presents a human-first interface that promises to transform operations. Copilot can assist industrial workers in real-time with speech and holograms; generative AI is used to search technical documentation, service records, training content, and other sources. Copilot then summarizes to provide interactive guidance. © 2024 Owner/Author.},
keywords = {3D, AI, AR, Gesture, Gestures, HoloLens, Language Model, LLM, Mixed reality, Real-time, Spatial computing, User experience design, User interfaces, Voice},
pubstate = {published},
tppubtype = {inproceedings}
}