AHCI RESEARCH GROUP
Publications
Papers published in international journals, conference and workshop proceedings, and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTeX record for each paper.
2025
Da Cruz, T. A.; Munoz, O.; Giligny, F.; Gouranton, V.
For a Perception of Monumentality in Eastern Arabia from the Neolithic to the Bronze Age: 3D Reconstruction and Multidimensional Simulations of Monuments and Landscapes Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 47–50, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6.
Abstract | Links | BibTeX | Tags: 3D reconstruction, 4D simulations, Archaeological Site, Bronze age, Digital elevation model, Eastern Arabia, Eastern arabium, Monumentality, Multidimensional simulation, Simulation virtual realities, Spatial dimension, Temporal dimensions, Three dimensional computer graphics, Virtual Reality
@inproceedings{da_cruz_for_2025,
title = {For a Perception of Monumentality in Eastern Arabia from the Neolithic to the Bronze Age: 3D Reconstruction and Multidimensional Simulations of Monuments and Landscapes},
author = {T. A. Da Cruz and O. Munoz and F. Giligny and V. Gouranton},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005139996&doi=10.1109%2fVRW66409.2025.00018&partnerID=40&md5=14e05ff7019a4c9d712fe42aef776c8d},
doi = {10.1109/VRW66409.2025.00018},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {47–50},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {The monumentality of Neolithic and Early Bronze Age (6th to 3rd millennium BC) structures in the Arabian Peninsula has never been studied through a comprehensive approach combining simulations and reconstructions. As a result, its perception remains understudied. By combining archaeological and paleoenvironmental data, 3D reconstruction, 4D simulations, virtual reality and generative AI, this PhD research project proposes to analyse the perception of monuments, exploring their spatial, visual and temporal dimensions, in order to answer the following question: how can we reconstruct and analyse the perception of monumentality in Eastern Arabia through 4D simulations, and how can the study of this perception influence our understanding of monumentality and territories? This article presents a work in progress, after three months of research, including one month at the Dhabtiyah archaeological site (Saudi Arabia, Eastern Province). © 2025 IEEE.},
keywords = {3D reconstruction, 4D simulations, Archaeological Site, Bronze age, Digital elevation model, Eastern Arabia, Eastern arabium, Monumentality, Multidimensional simulation, Simulation virtual realities, Spatial dimension, Temporal dimensions, Three dimensional computer graphics, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Sajiukumar, A.; Ranjan, A.; Parvathi, P. K.; Satheesh, A.; Udayan, J. Divya; Subramaniam, U.
Generative AI-Enabled Virtual Twin for Meeting Assistants Proceedings Article
In: Saba, T.; Rehman, A. (Ed.): Proc. - Int. Women Data Sci. Conf. at Prince Sultan Univ., WiDS-PSU, pp. 60–65, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833152092-2.
Abstract | Links | BibTeX | Tags: 3D avatar generation, 3D Avatars, 3D reconstruction, AI-augmented interaction, Augmented Reality, Communication and collaborations, Conversational AI, Neural radiance field, neural radiance fields (NeRF), Radiance field, Real time performance, real-time performance, Three dimensional computer graphics, Virtual spaces, Voice cloning
@inproceedings{sajiukumar_generative_2025,
title = {Generative AI-Enabled Virtual Twin for Meeting Assistants},
author = {A. Sajiukumar and A. Ranjan and P. K. Parvathi and A. Satheesh and J. Divya Udayan and U. Subramaniam},
editor = {Saba T. and Rehman A.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007691247&doi=10.1109%2fWiDS-PSU64963.2025.00025&partnerID=40&md5=f0bfb74a8f854c427054c73582909185},
doi = {10.1109/WiDS-PSU64963.2025.00025},
isbn = {979-833152092-2 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - Int. Women Data Sci. Conf. at Prince Sultan Univ., WiDS-PSU},
pages = {60–65},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {The growing dependence on virtual spaces for communication and collaboration has transformed interactions in numerous industries, ranging from professional meetings to education, entertainment, and healthcare. Despite the advancement of AI technologies such as three-dimensional modeling, voice cloning, and conversational AI, the convergence of these technologies in a single platform is still challenging. This paper introduces a unified framework that brings together state-of-the-art 3D avatar generation, real-time voice cloning, and conversational AI to enhance virtual interactions. The system utilizes Triplane neural representations and neural radiance fields (NeRF) for high-fidelity 3D avatar generation, speaker encoders coupled with Tacotron 2 and WaveRNN for natural voice cloning, and a context-aware chat algorithm for adaptive conversations. By overcoming the challenges of customization, integration, and real-time performance, the proposed framework addresses the increasing need for realistic virtual representations, setting new benchmarks for AI-augmented interaction in virtual conferences, online representation, education, and healthcare. © 2025 IEEE.},
keywords = {3D avatar generation, 3D Avatars, 3D reconstruction, AI-augmented interaction, Augmented Reality, Communication and collaborations, Conversational AI, Neural radiance field, neural radiance fields (NeRF), Radiance field, Real time performance, real-time performance, Three dimensional computer graphics, Virtual spaces, Voice cloning},
pubstate = {published},
tppubtype = {inproceedings}
}
Song, T.; Pabst, F.; Eck, U.; Navab, N.
Enhancing Patient Acceptance of Robotic Ultrasound through Conversational Virtual Agent and Immersive Visualizations Journal Article
In: IEEE Transactions on Visualization and Computer Graphics, vol. 31, no. 5, pp. 2901–2911, 2025, ISSN: 1077-2626.
Abstract | Links | BibTeX | Tags: 3D reconstruction, adult, Augmented Reality, Computer graphics, computer interface, echography, female, human, Humans, Imaging, Intelligent robots, Intelligent virtual agents, Language Model, male, Medical robotics, Middle Aged, Mixed reality, Patient Acceptance of Health Care, patient attitude, Patient comfort, procedures, Real-world, Reality visualization, Robotic Ultrasound, Robotics, Three-Dimensional, three-dimensional imaging, Trust and Acceptance, Ultrasonic applications, Ultrasonic equipment, Ultrasonography, Ultrasound probes, User-Computer Interface, Virtual agent, Virtual assistants, Virtual environments, Virtual Reality, Visual languages, Visualization, Young Adult
@article{song_enhancing_2025,
title = {Enhancing Patient Acceptance of Robotic Ultrasound through Conversational Virtual Agent and Immersive Visualizations},
author = {T. Song and F. Pabst and U. Eck and N. Navab},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105003687673&doi=10.1109%2fTVCG.2025.3549181&partnerID=40&md5=1d46569933582ecf5e967f0794aafc07},
doi = {10.1109/TVCG.2025.3549181},
issn = {10772626 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {IEEE Transactions on Visualization and Computer Graphics},
volume = {31},
number = {5},
pages = {2901–2911},
abstract = {Robotic ultrasound systems have the potential to improve medical diagnostics, but patient acceptance remains a key challenge. To address this, we propose a novel system that combines an AI-based virtual agent, powered by a large language model (LLM), with three mixed reality visualizations aimed at enhancing patient comfort and trust. The LLM enables the virtual assistant to engage in natural, conversational dialogue with patients, answering questions in any format and offering real-time reassurance, creating a more intelligent and reliable interaction. The virtual assistant is animated as controlling the ultrasound probe, giving the impression that the robot is guided by the assistant. The first visualization employs augmented reality (AR), allowing patients to see the real world and the robot with the virtual avatar superimposed. The second visualization is an augmented virtuality (AV) environment, where the real-world body part being scanned is visible, while a 3D Gaussian Splatting reconstruction of the room, excluding the robot, forms the virtual environment. The third is a fully immersive virtual reality (VR) experience, featuring the same 3D reconstruction but entirely virtual, where the patient sees a virtual representation of their body being scanned in a robot-free environment. In this case, the virtual ultrasound probe, mirrors the movement of the probe controlled by the robot, creating a synchronized experience as it touches and moves over the patient's virtual body. We conducted a comprehensive agent-guided robotic ultrasound study with all participants, comparing these visualizations against a standard robotic ultrasound procedure. Results showed significant improvements in patient trust, acceptance, and comfort. Based on these findings, we offer insights into designing future mixed reality visualizations and virtual agents to further enhance patient comfort and acceptance in autonomous medical procedures. © 1995-2012 IEEE.},
keywords = {3D reconstruction, adult, Augmented Reality, Computer graphics, computer interface, echography, female, human, Humans, Imaging, Intelligent robots, Intelligent virtual agents, Language Model, male, Medical robotics, Middle Aged, Mixed reality, Patient Acceptance of Health Care, patient attitude, Patient comfort, procedures, Real-world, Reality visualization, Robotic Ultrasound, Robotics, Three-Dimensional, three-dimensional imaging, Trust and Acceptance, Ultrasonic applications, Ultrasonic equipment, Ultrasonography, Ultrasound probes, User-Computer Interface, Virtual agent, Virtual assistants, Virtual environments, Virtual Reality, Visual languages, Visualization, Young Adult},
pubstate = {published},
tppubtype = {article}
}
Behravan, M.; Gračanin, D.
From Voices to Worlds: Developing an AI-Powered Framework for 3D Object Generation in Augmented Reality Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 150–155, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D object, 3D Object Generation, 3D reconstruction, Augmented Reality, Cutting edges, Generative AI, Interactive computer systems, Language Model, Large language model, large language models, matrix, Multilingual speech interaction, Real-time, Speech enhancement, Speech interaction, Volume Rendering
@inproceedings{behravan_voices_2025,
title = {From Voices to Worlds: Developing an AI-Powered Framework for 3D Object Generation in Augmented Reality},
author = {M. Behravan and D. Gračanin},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005153589&doi=10.1109%2fVRW66409.2025.00038&partnerID=40&md5=b8aaab4e2378cde3595d98d79266d371},
doi = {10.1109/VRW66409.2025.00038},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {150–155},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {This paper presents Matrix, an advanced AI-powered framework designed for real-time 3D object generation in Augmented Reality (AR) environments. By integrating a cutting-edge text-to-3D generative AI model, multilingual speech-to-text translation, and large language models (LLMs), the system enables seamless user interactions through spoken commands. The framework processes speech inputs, generates 3D objects, and provides object recommendations based on contextual understanding, enhancing AR experiences. A key feature of this framework is its ability to optimize 3D models by reducing mesh complexity, resulting in significantly smaller file sizes and faster processing on resource-constrained AR devices. Our approach addresses the challenges of high GPU usage, large model output sizes, and real-time system responsiveness, ensuring a smoother user experience. Moreover, the system is equipped with a pre-generated object repository, further reducing GPU load and improving efficiency. We demonstrate the practical applications of this framework in various fields such as education, design, and accessibility, and discuss future enhancements including image-to-3D conversion, environmental object detection, and multimodal support. The open-source nature of the framework promotes ongoing innovation and its utility across diverse industries. © 2025 IEEE.},
keywords = {3D modeling, 3D object, 3D Object Generation, 3D reconstruction, Augmented Reality, Cutting edges, Generative AI, Interactive computer systems, Language Model, Large language model, large language models, matrix, Multilingual speech interaction, Real-time, Speech enhancement, Speech interaction, Volume Rendering},
pubstate = {published},
tppubtype = {inproceedings}
}
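A recurring implementation detail in the abstract above is mesh optimization: reducing mesh complexity so generated assets stay small and render quickly on resource-constrained AR devices. A minimal sketch of that general step in Python, assuming the Open3D library; the file names and the 5,000-triangle budget are illustrative assumptions, not values from the paper.

# Hypothetical sketch: simplify a generated 3D asset before deploying it to an AR device.
# Paths and the triangle budget are illustrative assumptions, not the authors' settings.
import open3d as o3d

def simplify_for_ar(input_path: str, output_path: str, target_triangles: int = 5000) -> None:
    mesh = o3d.io.read_triangle_mesh(input_path)
    # Quadric-error decimation preserves overall shape while cutting triangle count and file size.
    simplified = mesh.simplify_quadric_decimation(target_number_of_triangles=target_triangles)
    simplified.compute_vertex_normals()
    o3d.io.write_triangle_mesh(output_path, simplified)

if __name__ == "__main__":
    simplify_for_ar("generated_object.obj", "generated_object_ar.obj")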
Mao, H.; Xu, Z.; Wei, S.; Quan, Y.; Deng, N.; Yang, X.
LLM-powered Gaussian Splatting in VR interactions Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW, pp. 1654–1655, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833151484-6.
Abstract | Links | BibTeX | Tags: 3D Gaussian Splatting, 3D reconstruction, Content creation, Digital elevation model, Gaussians, High quality, Language Model, material analysis, Materials analysis, Physical simulation, Quality rendering, Rendering (computer graphics), Splatting, Virtual Reality, Volume Rendering, VR systems
@inproceedings{mao_llm-powered_2025,
title = {LLM-powered Gaussian Splatting in VR interactions},
author = {H. Mao and Z. Xu and S. Wei and Y. Quan and N. Deng and X. Yang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005148017&doi=10.1109%2fVRW66409.2025.00472&partnerID=40&md5=ee725f655a37251ff335ad2098d15f22},
doi = {10.1109/VRW66409.2025.00472},
isbn = {979-833151484-6 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces Abstr. Workshops, VRW},
pages = {1654–1655},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Recent advances in radiance field rendering, particularly 3D Gaussian Splatting (3DGS), have demonstrated significant potential for VR content creation, offering both high-quality rendering and an efficient production pipeline. However, current physics-based interaction systems for 3DGS either rely on simplistic, unrealistic simulations or require substantial user input for complex scenes, largely due to the lack of scene comprehension. In this demonstration, we present a highly realistic interactive VR system powered by large language models (LLMs). After object-aware GS reconstruction, we prompt GPT-4o to analyze the physical properties of objects in the scene, which then guide physical simulations that adhere to real-world phenomena. Additionally, we design a GPT-assisted GS inpainting module to complete the areas occluded by manipulated objects. To facilitate rich interaction, we introduce a computationally efficient physical simulation framework through a PBD-based unified interpolation method, which supports various forms of physical interactions. In our research demonstrations, we reconstruct a variety of scenes enhanced by the LLM's understanding, showcasing how our VR system can support complex, realistic interactions without additional manual design or annotation. © 2025 IEEE.},
keywords = {3D Gaussian Splatting, 3D reconstruction, Content creation, Digital elevation model, Gaussians, High quality, Language Model, material analysis, Materials analysis, Physical simulation, Quality rendering, Rendering (computer graphics), Splatting, Virtual Reality, Volume Rendering, VR systems},
pubstate = {published},
tppubtype = {inproceedings}
}
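The abstract above describes prompting GPT-4o for the physical properties of reconstructed objects so that subsequent physical simulation respects them. A minimal sketch of that general idea, assuming the OpenAI Python client and an invented JSON schema (mass, friction, elasticity); the prompt wording and schema are illustrative assumptions, not the authors' implementation.

# Hypothetical sketch: ask an LLM for per-object physical properties as JSON
# that a physics engine could consume. Prompt and schema are assumptions.
import json
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def estimate_physical_properties(object_names: list[str]) -> dict:
    prompt = (
        "For each object below, estimate mass_kg, friction (0 to 1), and elasticity (0 to 1). "
        "Respond with a single JSON object keyed by object name.\n" + "\n".join(object_names)
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
    )
    return json.loads(response.choices[0].message.content)

if __name__ == "__main__":
    print(estimate_physical_properties(["ceramic mug", "rubber ball", "wooden table"]))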
Chen, J.; Grubert, J.; Kristensson, P. O.
Analyzing Multimodal Interaction Strategies for LLM-Assisted Manipulation of 3D Scenes Proceedings Article
In: Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR, pp. 206–216, Institute of Electrical and Electronics Engineers Inc., 2025, ISBN: 979-833153645-9.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D reconstruction, 3D scene editing, 3D scenes, Computer simulation languages, Editing systems, Immersive environment, Interaction pattern, Interaction strategy, Language Model, Large language model, large language models, Multimodal Interaction, Scene editing, Three dimensional computer graphics, Virtual environments, Virtual Reality
@inproceedings{chen_analyzing_2025,
title = {Analyzing Multimodal Interaction Strategies for LLM-Assisted Manipulation of 3D Scenes},
author = {J. Chen and J. Grubert and P. O. Kristensson},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105002716635&doi=10.1109%2fVR59515.2025.00045&partnerID=40&md5=306aa7fbb3dad0aa9d43545f3c7eb9ea},
doi = {10.1109/VR59515.2025.00045},
isbn = {979-833153645-9 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Proc. - IEEE Conf. Virtual Real. 3D User Interfaces, VR},
pages = {206–216},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {As more applications of large language models (LLMs) for 3D content in immersive environments emerge, it is crucial to study user behavior to identify interaction patterns and potential barriers to guide the future design of immersive content creation and editing systems which involve LLMs. In an empirical user study with 12 participants, we combine quantitative usage data with post-experience questionnaire feedback to reveal common interaction patterns and key barriers in LLM-assisted 3D scene editing systems. We identify opportunities for improving natural language interfaces in 3D design tools and propose design recommendations. Through an empirical study, we demonstrate that LLM-assisted interactive systems can be used productively in immersive environments. © 2025 IEEE.},
keywords = {3D modeling, 3D reconstruction, 3D scene editing, 3D scenes, Computer simulation languages, Editing systems, Immersive environment, Interaction pattern, Interaction strategy, Language Model, Large language model, large language models, Multimodal Interaction, Scene editing, Three dimensional computer graphics, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Leininger, P.; Weber, C. J.; Rothe, S.
Understanding Creative Potential and Use Cases of AI-Generated Environments for Virtual Film Productions: Insights from Industry Professionals Proceedings Article
In: IMX - Proc. ACM Int. Conf. Interact. Media Experiences, pp. 60–78, Association for Computing Machinery, Inc, 2025, ISBN: 979-840071391-0.
Abstract | Links | BibTeX | Tags: 3-D environments, 3D reconstruction, 3D Scene Reconstruction, 3d scenes reconstruction, AI-generated 3d environment, AI-Generated 3D Environments, Computer interaction, Creative Collaboration, Creatives, Digital content creation, Digital Content Creation., Filmmaking workflow, Filmmaking Workflows, Gaussian distribution, Gaussian Splatting, Gaussians, Generative AI, Graphical user interface, Graphical User Interface (GUI), Graphical user interfaces, Human computer interaction, human-computer interaction, Human-Computer Interaction (HCI), Immersive, Immersive Storytelling, Interactive computer graphics, Interactive computer systems, Interactive media, Mesh generation, Previsualization, Real-Time Rendering, Splatting, Three dimensional computer graphics, Virtual production, Virtual Production (VP), Virtual Reality, Work-flows
@inproceedings{leininger_understanding_2025,
title = {Understanding Creative Potential and Use Cases of AI-Generated Environments for Virtual Film Productions: Insights from Industry Professionals},
author = {P. Leininger and C. J. Weber and S. Rothe},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105007976841&doi=10.1145%2f3706370.3727853&partnerID=40&md5=0d4cf7a2398d12d04e4f0ab182474a10},
doi = {10.1145/3706370.3727853},
isbn = {979-840071391-0 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {IMX - Proc. ACM Int. Conf. Interact. Media Experiences},
pages = {60–78},
publisher = {Association for Computing Machinery, Inc},
abstract = {Virtual production (VP) is transforming filmmaking by integrating real-time digital elements with live-action footage, offering new creative possibilities and streamlined workflows. While industry experts recognize AI's potential to revolutionize VP, its practical applications and value across different production phases and user groups remain underexplored. Building on initial research into generative and data-driven approaches, this paper presents the first systematic pilot study evaluating three types of AI-generated 3D environments - Depth Mesh, 360° Panoramic Meshes, and Gaussian Splatting - through the participation of 15 filmmaking professionals from diverse roles. Unlike commonly used 2D AI-generated visuals, our approach introduces navigable 3D environments that offer greater control and flexibility, aligning more closely with established VP workflows. Through expert interviews and literature research, we developed evaluation criteria to assess their usefulness beyond concept development, extending to previsualization, scene exploration, and interdisciplinary collaboration. Our findings indicate that different environments cater to distinct production needs, from early ideation to detailed visualization. Gaussian Splatting proved effective for high-fidelity previsualization, while 360° Panoramic Meshes excelled in rapid concept ideation. Despite their promise, challenges such as limited interactivity and customization highlight areas for improvement. Our prototype, EnVisualAIzer, built in Unreal Engine 5, provides an accessible platform for diverse filmmakers to engage with AI-generated environments, fostering a more inclusive production process. By lowering technical barriers, these environments have the potential to make advanced VP tools more widely available. This study offers valuable insights into the evolving role of AI in VP and sets the stage for future research and development. © 2025 Copyright held by the owner/author(s). Publication rights licensed to ACM.},
keywords = {3-D environments, 3D reconstruction, 3D Scene Reconstruction, 3d scenes reconstruction, AI-generated 3d environment, AI-Generated 3D Environments, Computer interaction, Creative Collaboration, Creatives, Digital content creation, Digital Content Creation., Filmmaking workflow, Filmmaking Workflows, Gaussian distribution, Gaussian Splatting, Gaussians, Generative AI, Graphical user interface, Graphical User Interface (GUI), Graphical user interfaces, Human computer interaction, human-computer interaction, Human-Computer Interaction (HCI), Immersive, Immersive Storytelling, Interactive computer graphics, Interactive computer systems, Interactive media, Mesh generation, Previsualization, Real-Time Rendering, Splatting, Three dimensional computer graphics, Virtual production, Virtual Production (VP), Virtual Reality, Work-flows},
pubstate = {published},
tppubtype = {inproceedings}
}
2024
Kim, S. J.; Cao, D. D.; Spinola, F.; Lee, S. J.; Cho, K. S.
RoomRecon: High-Quality Textured Room Layout Reconstruction on Mobile Devices Proceedings Article
In: Eck, U.; Sra, M.; Stefanucci, J.; Sugimoto, M.; Tatzgern, M.; Williams, I. (Ed.): Proc. - IEEE Int. Symp. Mixed Augment. Real., ISMAR, pp. 544–553, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 979-833151647-5.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D models, 3D reconstruction, 3d-modeling, AR-assisted image capturing, Architectural design, Augmented Reality, Augmented reality-assisted image capturing, Image capturing, Indoor 3D reconstruction, Indoor space, Mobile application, Mobile Applications, Mortar, Room layout, Texturing, Texturing quality
@inproceedings{kim_roomrecon_2024,
title = {RoomRecon: High-Quality Textured Room Layout Reconstruction on Mobile Devices},
author = {S. J. Kim and D. D. Cao and F. Spinola and S. J. Lee and K. S. Cho},
editor = {Eck U. and Sra M. and Stefanucci J. and Sugimoto M. and Tatzgern M. and Williams I.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85213494599&doi=10.1109%2fISMAR62088.2024.00069&partnerID=40&md5=0f6b9d4c44d9c55cafba7ad76651ea07},
doi = {10.1109/ISMAR62088.2024.00069},
isbn = {979-833151647-5 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {Proc. - IEEE Int. Symp. Mixed Augment. Real., ISMAR},
pages = {544–553},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {Widespread RGB-Depth (RGB-D) sensors and advanced 3D reconstruction technologies facilitate the capture of indoor spaces, improving the fields of augmented reality (AR), virtual reality (VR), and extended reality (XR). Nevertheless, current technologies still face limitations, such as the inability to reflect minor scene changes without a complete recapture, the lack of semantic scene understanding, and various texturing challenges that affect the 3D model's visual quality. These issues affect the realism required for VR experiences and other applications such as in interior design and real estate. To address these challenges, we introduce RoomRecon, an interactive, real-time scanning and texturing pipeline for 3D room models. We propose a two-phase texturing pipeline that integrates AR-guided image capturing for texturing and generative AI models to improve texturing quality and provide better replicas of indoor spaces. Moreover, we suggest to focus only on permanent room elements such as walls, floors, and ceilings, to allow for easily customizable 3D models. We conduct experiments in a variety of indoor spaces to assess the texturing quality and speed of our method. The quantitative results and user study demonstrate that RoomRecon surpasses state-of-the-art methods in terms of texturing quality and on-device computation time. © 2024 IEEE.},
keywords = {3D modeling, 3D models, 3D reconstruction, 3d-modeling, AR-assisted image capturing, Architectural design, Augmented Reality, Augmented reality-assisted image capturing, Image capturing, Indoor 3D reconstruction, Indoor space, Mobile application, Mobile Applications, Mortar, Room layout, Texturing, Texturing quality},
pubstate = {published},
tppubtype = {inproceedings}
}
Rausa, M.; Gaglio, S.; Augello, A.; Caggianese, G.; Franchini, S.; Gallo, L.; Sabatucci, L.
Enriching Metaverse with Memories Through Generative AI: A Case Study Proceedings Article
In: IEEE Int. Conf. Metrol. Ext. Real., Artif. Intell. Neural Eng., MetroXRAINE - Proc., pp. 371–376, Institute of Electrical and Electronics Engineers Inc., St Albans, United Kingdom, 2024, ISBN: 979-835037800-9.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D models, 3D reconstruction, 3d-modeling, Case-studies, Generative adversarial networks, Generative AI, Input modes, Metamemory, Metaverses, Synthetic Data Generation, Synthetic data generations, Textual description, Virtual environments, Virtual Reality
@inproceedings{rausa_enriching_2024,
title = {Enriching Metaverse with Memories Through Generative AI: A Case Study},
author = {M. Rausa and S. Gaglio and A. Augello and G. Caggianese and S. Franchini and L. Gallo and L. Sabatucci},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85216124702&doi=10.1109%2fMetroXRAINE62247.2024.10796338&partnerID=40&md5=580d0727ab8740a6ada62eeef5ac283f},
doi = {10.1109/MetroXRAINE62247.2024.10796338},
isbn = {979-835037800-9 (ISBN)},
year = {2024},
date = {2024-01-01},
urldate = {2025-01-07},
booktitle = {IEEE Int. Conf. Metrol. Ext. Real., Artif. Intell. Neural Eng., MetroXRAINE - Proc.},
pages = {371–376},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
address = {St Albans, United Kingdom},
abstract = {The paper introduces MetaMemory, an approach to generate 3D models from either textual descriptions or photographs of objects, offering dual input modes for enhanced representation. MetaMemory's architecture is discussed, presenting the tools employed to extract the object from the image, generate the 3D mesh from texts or images, and visualize the reconstructed object in an immersive scenario. Afterwards, a case study on reconstructing memories of ancient crafts is examined together with the achieved results, highlighting current limitations and potential applications. © 2024 IEEE.},
keywords = {3D modeling, 3D models, 3D reconstruction, 3d-modeling, Case-studies, Generative adversarial networks, Generative AI, Input modes, Metamemory, Metaverses, Synthetic Data Generation, Synthetic data generations, Textual description, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Xie, W.; Liu, Y.; Wang, K.; Wang, M.
LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach Journal Article
In: IEEE Signal Processing Letters, vol. 31, pp. 2250–2254, 2024, ISSN: 1070-9908.
Abstract | Links | BibTeX | Tags: 3D reconstruction, Cross-modal, Language Model, Large language model, Learning approach, Multi-modal, Multimodal quality assessment, Point cloud quality assessment, Point-clouds, Quality assessment
@article{xie_llm-guided_2024,
title = {LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach},
author = {W. Xie and Y. Liu and K. Wang and M. Wang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85203417746&doi=10.1109%2fLSP.2024.3452556&partnerID=40&md5=88460ec3043fa9161c4d5dd6fc282f95},
doi = {10.1109/LSP.2024.3452556},
issn = {10709908 (ISSN)},
year = {2024},
date = {2024-01-01},
journal = {IEEE Signal Processing Letters},
volume = {31},
pages = {2250–2254},
abstract = {This paper addresses the critical need for accurate and reliable point cloud quality assessment (PCQA) in various applications, such as autonomous driving, robotics, virtual reality, and 3D reconstruction. To meet this need, we propose a large language model (LLM)-guided PCQA approach based on graph learning. Specifically, we first utilize the LLM to generate quality description texts for each 3D object, and employ two CLIP-like feature encoders to represent the image and text modalities. Next, we design a latent feature enhancer module to improve contrastive learning, enabling more effective alignment performance. Finally, we develop a graph network fusion module that utilizes a ranking-based loss to adjust the relationship of different nodes, which explicitly considers both modality fusion and quality ranking. Experimental results on three benchmark datasets demonstrate the effectiveness and superiority of our approach over 12 representative PCQA methods, which demonstrate the potential of multi-modal learning, the importance of latent feature enhancement, and the significance of graph-based fusion in advancing the field of PCQA. © 2024 IEEE.},
keywords = {3D reconstruction, Cross-modal, Language Model, Large language model, Learning approach, Multi-modal, Multimodal quality assessment, Point cloud quality assessment, Point-clouds, Quality assessment},
pubstate = {published},
tppubtype = {article}
}
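The abstract above mentions a ranking-based loss that encourages predicted quality scores to respect the ground-truth ordering of point clouds. The generic mechanism, a pairwise margin ranking loss over scores from a small scoring head, is sketched below in PyTorch; the head architecture, feature size, and margin are illustrative assumptions, not the paper's graph fusion module.

# Hypothetical sketch of a pairwise ranking objective for quality assessment.
# The scoring head, feature size, and margin are assumptions for illustration only.
import torch
import torch.nn as nn

class QualityHead(nn.Module):
    # Maps a fused feature vector to a scalar quality score.
    def __init__(self, dim: int = 512):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(dim, 128), nn.ReLU(), nn.Linear(128, 1))

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        return self.mlp(feats).squeeze(-1)

if __name__ == "__main__":
    head = QualityHead()
    feats_a, feats_b = torch.randn(8, 512), torch.randn(8, 512)  # fused features for paired samples
    target = (torch.rand(8) > 0.5).float() * 2 - 1               # +1 if sample A should rank higher, else -1
    loss = nn.MarginRankingLoss(margin=0.5)(head(feats_a), head(feats_b), target)
    loss.backward()
    print(float(loss))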
Federico, G.; Carrara, F.; Amato, G.; Di Benedetto, M.
Spatio-Temporal 3D Reconstruction from Frame Sequences and Feature Points Proceedings Article
In: ACM Int. Conf. Proc. Ser., pp. 52–64, Association for Computing Machinery, 2024, ISBN: 979-840071794-9.
Abstract | Links | BibTeX | Tags: 3D reconstruction, Adversarial machine learning, Artificial intelligence, Color motion pictures, Color photography, Contrastive Learning, De-noising, Deep learning, Denoising Diffusion Probabilistic Model, Frame features, machine learning, Machine-learning, Probabilistic models, Signed Distance Field, Signed distance fields, Spatio-temporal, Video Reconstruction, Video streaming
@inproceedings{federico_spatio-temporal_2024,
title = {Spatio-Temporal 3D Reconstruction from Frame Sequences and Feature Points},
author = {G. Federico and F. Carrara and G. Amato and M. Di Benedetto},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85203128613&doi=10.1145%2f3672406.3672415&partnerID=40&md5=2a0dc51baa15f0dcd7f9d2cca708ec15},
doi = {10.1145/3672406.3672415},
isbn = {979-840071794-9 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {ACM Int. Conf. Proc. Ser.},
pages = {52–64},
publisher = {Association for Computing Machinery},
abstract = {Reconstructing a large real environment is a fundamental task to promote eXtended Reality adoption in industrial and entertainment fields. However, the short range of depth cameras, the sparsity of LiDAR sensors, and the huge computational cost of Structure-from-Motion pipelines prevent scene replication in near real time. To overcome these limitations, we introduce a spatio-temporal diffusion neural architecture, a generative AI technique that fuses temporal information (i.e., a short temporally-ordered list of color photographs, like sparse frames of a video stream) with an approximate spatial resemblance of the explored environment. Our aim is to modify an existing 3D diffusion neural model to produce a Signed Distance Field volume from which a 3D mesh representation can be extracted. Our results show that the hallucination approach of diffusion models is an effective methodology where a fast reconstruction is a crucial target. © 2024 Owner/Author.},
keywords = {3D reconstruction, Adversarial machine learning, Artificial intelligence, Color motion pictures, Color photography, Contrastive Learning, De-noising, Deep learning, Denoising Diffusion Probabilistic Model, Frame features, machine learning, Machine-learning, Probabilistic models, Signed Distance Field, Signed distance fields, Spatio-temporal, Video Reconstruction, Video streaming},
pubstate = {published},
tppubtype = {inproceedings}
}