AHCI RESEARCH GROUP
Publications
Papers published in international journals,
proceedings of conferences, workshops and books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTex record for each paper.
2025
Sinha, Y.; Shanmugam, S.; Sahu, Y. K.; Mukhopadhyay, A.; Biswas, P.
Diffuse Your Data Blues: Augmenting Low-Resource Datasets via User-Assisted Diffusion Proceedings Article
In: Int Conf Intell User Interfaces Proc IUI, pp. 538–552, Association for Computing Machinery, 2025, ISBN: 9798400713064 (ISBN).
Abstract | Links | BibTeX | Tags: Data gathering, Detection models, Diffusion Model, diffusion models, Efficient Augmentation, Image Composition, Industrial context, Mixed reality, Object Detection, Objects detection, Synthetic Dataset, Synthetic datasets, Training objects
@inproceedings{sinha_diffuse_2025,
title = {Diffuse Your Data Blues: Augmenting Low-Resource Datasets via User-Assisted Diffusion},
author = {Y. Sinha and S. Shanmugam and Y. K. Sahu and A. Mukhopadhyay and P. Biswas},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001924293&doi=10.1145%2F3708359.3712163&partnerID=40&md5=8de7dcc94f2b2ad9e8d6de168ba05840},
doi = {10.1145/3708359.3712163},
isbn = {9798400713064 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Int Conf Intell User Interfaces Proc IUI},
pages = {538–552},
publisher = {Association for Computing Machinery},
abstract = {Mixed reality applications in industrial contexts necessitate extensive and varied datasets for training object detection models, yet actual data gathering may be obstructed by logistical or cost issues. This study investigates the implementation of generative AI methods to work on this issue for mixed reality applications, with an emphasis on assembly and disassembly tasks. The novel objects found in industrial settings are difficult to describe using words, making text-based models less effective. In this study, a diffusion model is used to generate images by combining novel objects with various backgrounds. The backgrounds are selected where object detection in specific applications has been ineffective. This approach efficiently produces a diverse range of training samples. We compare three approaches: traditional augmentation methods, GAN-based augmentation, and Diffusion-based augmentation. Results show that the diffusion model significantly improved detection metrics. For instance, applying diffusion models to the dataset containing mechanical components of a pneumatic cylinder raised the F1 Score from 69.77 to 84.21 and the mAP@50 from 76.48 to 88.77, resulting in an increase in object detection performance, with a 67% less dataset size compared to the traditional augmented dataset. The proposed image composition diffusion model and user-friendly interface further simplify dataset enrichment, proving effective for augmenting data and improving the robustness of detection models. © 2025 Elsevier B.V., All rights reserved.},
keywords = {Data gathering, Detection models, Diffusion Model, diffusion models, Efficient Augmentation, Image Composition, Industrial context, Mixed reality, Object Detection, Objects detection, Synthetic Dataset, Synthetic datasets, Training objects},
pubstate = {published},
tppubtype = {inproceedings}
}
Pielage, L.; Schmidle, P.; Marschall, B.; Risse, B.
Interactive High-Quality Skin Lesion Generation using Diffusion Models for VR-based Dermatological Education Proceedings Article
In: Int Conf Intell User Interfaces Proc IUI, pp. 878–897, Association for Computing Machinery, 2025, ISBN: 9798400713064 (ISBN).
Abstract | Links | BibTeX | Tags: Deep learning, Dermatology, Diffusion Model, diffusion models, Digital elevation model, Generative AI, Graphical user interfaces, Guidance Strategies, Guidance strategy, Image generation, Image generations, Inpainting, Interactive Generation, Medical education, Medical Imaging, Simulation training, Skin lesion, Upsampling, Virtual environments, Virtual Reality
@inproceedings{pielage_interactive_2025,
title = {Interactive High-Quality Skin Lesion Generation using Diffusion Models for VR-based Dermatological Education},
author = {L. Pielage and P. Schmidle and B. Marschall and B. Risse},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105001923208&doi=10.1145%2F3708359.3712101&partnerID=40&md5=d15d791e0d786bee2da91553da332ca3},
doi = {10.1145/3708359.3712101},
isbn = {9798400713064 (ISBN)},
year = {2025},
date = {2025-01-01},
booktitle = {Int Conf Intell User Interfaces Proc IUI},
pages = {878–897},
publisher = {Association for Computing Machinery},
abstract = {Malignant melanoma is one of the most lethal forms of cancer when not detected early. As a result, cancer screening programs have been implemented internationally, all of which require visual inspection of skin lesions. Early melanoma detection is a crucial competence in medical and dermatological education, and it is primarily trained using 2D imagery. However, given the intrinsic 3D nature of skin lesions and the importance of incorporating additional contextual information about the patient (e.g., skin type, nearby lesions, etc.), this approach falls short of providing a comprehensive and scalable learning experience. A potential solution is the use of Virtual Reality (VR) scenarios, which can offer an effective strategy to train skin cancer screenings in a realistic 3D setting, thereby enhancing medical students' awareness of early melanoma detection. In this paper, we present a comprehensive pipeline and models for generating malignant melanomas and benign nevi, which can be utilized in VR-based medical training. We use diffusion models for the generation of skin lesions, which we have enhanced with various guiding strategies to give educators maximum flexibility in designing scenarios and seamlessly placing lesions on virtual agents. Additionally, we have developed a tool which comprises a graphical user interface (GUI) enabling the generation of new lesions and adapting existing ones using an intuitive and interactive inpainting strategy. The tool also offers a novel custom upsampling strategy to achieve a sufficient resolution required for diagnostic purposes. The generated skin lesions have been validated in a user study with trained dermatologists, confirming the overall high quality of the generated lesions and the utility for educational purposes. © 2025 Elsevier B.V., All rights reserved.},
keywords = {Deep learning, Dermatology, Diffusion Model, diffusion models, Digital elevation model, Generative AI, Graphical user interfaces, Guidance Strategies, Guidance strategy, Image generation, Image generations, Inpainting, Interactive Generation, Medical education, Medical Imaging, Simulation training, Skin lesion, Upsampling, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}
Saddik, A. El; Ahmad, J.; Khan, M.; Abouzahir, S.; Gueaieb, W.
Unleashing Creativity in the Metaverse: Generative AI and Multimodal Content Journal Article
In: ACM Transactions on Multimedia Computing, Communications and Applications, vol. 21, no. 7, pp. 1–43, 2025, ISSN: 15516857 (ISSN); 15516865 (ISSN), (Publisher: Association for Computing Machinery).
Abstract | Links | BibTeX | Tags: Adversarial networks, Artificial intelligence, Content generation, Context information, Creatives, Diffusion Model, diffusion models, Generative adversarial networks, Generative AI, Human engineering, Information instructions, Interactive computer graphics, Interactive computer systems, Interactive devices, Interoperability, Metaverse, Metaverses, Multi-modal, multimodal, Simple++, Three dimensional computer graphics, user experience, User interfaces, Virtual Reality
@article{el_saddik_unleashing_2025,
title = {Unleashing Creativity in the Metaverse: Generative AI and Multimodal Content},
author = {A. El Saddik and J. Ahmad and M. Khan and S. Abouzahir and W. Gueaieb},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105011860002&doi=10.1145%2F3713075&partnerID=40&md5=20064843ced240c42e9353d747672cb3},
doi = {10.1145/3713075},
issn = {15516857 (ISSN); 15516865 (ISSN)},
year = {2025},
date = {2025-01-01},
journal = {ACM Transactions on Multimedia Computing, Communications and Applications},
volume = {21},
number = {7},
pages = {1–43},
abstract = {The metaverse presents an emerging creative expression and collaboration frontier where generative artificial intelligence (GenAI) can play a pivotal role with its ability to generate multimodal content from simple prompts. These prompts allow the metaverse to interact with GenAI, where context information, instructions, input data, or even output indications constituting the prompt can come from within the metaverse. However, their integration poses challenges regarding interoperability, lack of standards, scalability, and maintaining a high-quality user experience. This article explores how GenAI can productively assist in enhancing creativity within the contexts of the metaverse and unlock new opportunities. We provide a technical, in-depth overview of the different generative models for image, video, audio, and 3D content within the metaverse environments. We also explore the bottlenecks, opportunities, and innovative applications of GenAI from the perspectives of end users, developers, service providers, and AI researchers. This survey commences by highlighting the potential of GenAI for enhancing the metaverse experience through dynamic content generation to populate massive virtual worlds. Subsequently, we shed light on the ongoing research practices and trends in multimodal content generation, enhancing realism and creativity and alleviating bottlenecks related to standardization, computational cost, privacy, and safety. Last, we share insights into promising research directions toward the integration of GenAI with the metaverse for creative enhancement, improved immersion, and innovative interactive applications. © 2025 Elsevier B.V., All rights reserved.},
note = {Publisher: Association for Computing Machinery},
keywords = {Adversarial networks, Artificial intelligence, Content generation, Context information, Creatives, Diffusion Model, diffusion models, Generative adversarial networks, Generative AI, Human engineering, Information instructions, Interactive computer graphics, Interactive computer systems, Interactive devices, Interoperability, Metaverse, Metaverses, Multi-modal, multimodal, Simple++, Three dimensional computer graphics, user experience, User interfaces, Virtual Reality},
pubstate = {published},
tppubtype = {article}
}
2024
Jayaraman, S.; Bhavya, R.; Srihari, V.; Rajam, V. Mary Anita
TexAVi: Generating Stereoscopic VR Video Clips from Text Descriptions Proceedings Article
In: IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI, Institute of Electrical and Electronics Engineers Inc., 2024, ISBN: 9798350376876 (ISBN).
Abstract | Links | BibTeX | Tags: Adversarial networks, Computer simulation languages, Deep learning, Depth Estimation, Depth perception, Diffusion Model, diffusion models, Digital elevation model, Generative adversarial networks, Generative model, Generative systems, Language Model, Motion capture, Stereo image processing, Text-to-image, Training data, Video analysis, Video-clips, Virtual environments, Virtual Reality
@inproceedings{jayaraman_texavi_2024,
title = {TexAVi: Generating Stereoscopic VR Video Clips from Text Descriptions},
author = {S. Jayaraman and R. Bhavya and V. Srihari and V. Mary Anita Rajam},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85215265234&doi=10.1109%2FCVMI61877.2024.10782691&partnerID=40&md5=21e6ecfcc0710c036ba93e39b5fcd30d},
doi = {10.1109/CVMI61877.2024.10782691},
isbn = {9798350376876 (ISBN)},
year = {2024},
date = {2024-01-01},
booktitle = {IEEE Int. Conf. Comput. Vis. Mach. Intell., CVMI},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
abstract = {While generative models such as text-to-image, large language models and text-to-video have seen significant progress, the extension to text-to-virtual-reality remains largely unexplored, due to a deficit in training data and the complexity of achieving realistic depth and motion in virtual environments. This paper proposes an approach to coalesce existing generative systems to form a stereoscopic virtual reality video from text. Carried out in three main stages, we start with a base text-to-image model that captures context from an input text. We then employ Stable Diffusion on the rudimentary image produced, to generate frames with enhanced realism and overall quality. These frames are processed with depth estimation algorithms to create left-eye and right-eye views, which are stitched side-by-side to create an immersive viewing experience. Such systems would be highly beneficial in virtual reality production, since filming and scene building often require extensive hours of work and post-production effort. We utilize image evaluation techniques, specifically Fréchet Inception Distance and CLIP Score, to assess the visual quality of frames produced for the video. These quantitative measures establish the proficiency of the proposed method. Our work highlights the exciting possibilities of using natural language-driven graphics in fields like virtual reality simulations. © 2025 Elsevier B.V., All rights reserved.},
keywords = {Adversarial networks, Computer simulation languages, Deep learning, Depth Estimation, Depth perception, Diffusion Model, diffusion models, Digital elevation model, Generative adversarial networks, Generative model, Generative systems, Language Model, Motion capture, Stereo image processing, Text-to-image, Training data, Video analysis, Video-clips, Virtual environments, Virtual Reality},
pubstate = {published},
tppubtype = {inproceedings}
}