AHCI RESEARCH GROUP
Publications
Papers published in international journals, in conference and workshop proceedings, and in books.
OUR RESEARCH
Scientific Publications
How to
You can use the tag cloud to select only the papers dealing with specific research topics.
You can expand the Abstract, Links and BibTeX record for each paper.
2025
Zeng, S.-Y.; Liang, T.-Y.
PartConverter: A Part-Oriented Transformation Framework for Point Clouds Journal Article
In: IET Image Processing, vol. 19, no. 1, 2025, ISSN: 1751-9659.
Abstract | Links | BibTeX | Tags: 3D modeling, 3D models, 3d-modeling, Adversarial networks, attention mechanism, Attention mechanisms, Auto encoders, Cloud transformations, Generative Adversarial Network, Part assembler, Part-oriented, Point cloud transformation, Point-clouds
@article{zeng_partconverter_2025,
title = {PartConverter: A Part-Oriented Transformation Framework for Point Clouds},
author = {S.-Y. Zeng and T.-Y. Liang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105005775417&doi=10.1049%2fipr2.70104&partnerID=40&md5=1ee3178fd6b4a03bc7e299e1292e9694},
doi = {10.1049/ipr2.70104},
issn = {1751-9659},
year = {2025},
date = {2025-01-01},
journal = {IET Image Processing},
volume = {19},
number = {1},
abstract = {With generative AI technologies advancing rapidly, the capabilities for 3D model generation and transformation are expanding across industries like manufacturing, healthcare, and virtual reality. However, existing methods based on generative adversarial networks (GANs), autoencoders, or transformers still have notable limitations. They primarily generate entire objects without providing flexibility for independent part transformation or precise control over model components. These constraints pose challenges for applications requiring complex object manipulation and fine-grained adjustments. To overcome these limitations, we propose PartConverter, a novel part-oriented point cloud transformation framework emphasizing flexibility and precision in 3D model transformations. PartConverter leverages attention mechanisms and autoencoders to capture crucial details within each part while modeling the relationships between components, thereby enabling highly customizable, part-wise transformations that maintain overall consistency. Additionally, our part assembler ensures that transformed parts align coherently, resulting in a consistent and realistic final 3D shape. This framework significantly enhances control over detailed part modeling, increasing the flexibility and efficiency of 3D model transformation workflows. © 2025 The Author(s). IET Image Processing published by John Wiley & Sons Ltd on behalf of The Institution of Engineering and Technology.},
keywords = {3D modeling, 3D models, 3d-modeling, Adversarial networks, attention mechanism, Attention mechanisms, Auto encoders, Cloud transformations, Generative Adversarial Network, Part assembler, Part-oriented, Point cloud transformation, Point-clouds},
pubstate = {published},
tppubtype = {article}
}
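To make the part-wise transformation idea in the abstract above more concrete, here is a minimal, illustrative PyTorch sketch, not the authors' implementation: all module names, dimensions, and the pooling and assembly choices are assumptions. Each part is encoded into a latent vector, cross-part attention models the relationships between parts, and a simple assembler concatenates the decoded parts.

# Illustrative sketch only (not the PartConverter code): a part-wise point cloud
# autoencoder with cross-part attention and a simple "part assembler".
import torch
import torch.nn as nn

class PartEncoder(nn.Module):
    """Encodes one part (N x 3 points) into a single latent vector."""
    def __init__(self, latent_dim=128):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(3, 64), nn.ReLU(),
                                 nn.Linear(64, latent_dim))
    def forward(self, pts):                      # pts: (B, N, 3)
        return self.mlp(pts).max(dim=1).values   # max-pool over points -> (B, latent_dim)

class PartDecoder(nn.Module):
    """Decodes a latent vector back into a fixed-size point set."""
    def __init__(self, latent_dim=128, num_points=512):
        super().__init__()
        self.num_points = num_points
        self.mlp = nn.Sequential(nn.Linear(latent_dim, 256), nn.ReLU(),
                                 nn.Linear(256, num_points * 3))
    def forward(self, z):                         # z: (B, latent_dim)
        return self.mlp(z).view(-1, self.num_points, 3)

class PartConverterSketch(nn.Module):
    """Encode each part, exchange information via attention, decode per part."""
    def __init__(self, latent_dim=128, num_points=512):
        super().__init__()
        self.encoder = PartEncoder(latent_dim)
        self.attn = nn.MultiheadAttention(latent_dim, num_heads=4, batch_first=True)
        self.decoder = PartDecoder(latent_dim, num_points)
    def forward(self, parts):                     # parts: list of (B, N_i, 3) tensors
        latents = torch.stack([self.encoder(p) for p in parts], dim=1)  # (B, P, D)
        fused, _ = self.attn(latents, latents, latents)  # cross-part relationships
        # "Part assembler": decode each fused latent and concatenate the parts.
        decoded = [self.decoder(fused[:, i]) for i in range(fused.size(1))]
        return torch.cat(decoded, dim=1)          # (B, P * num_points, 3)

# Usage: two parts of 256 points each, batch of 4.
model = PartConverterSketch()
out = model([torch.randn(4, 256, 3), torch.randn(4, 256, 3)])
print(out.shape)  # torch.Size([4, 1024, 3])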
Zhang, G.; Wang, Y.; Luo, C.; Xu, S.; Ming, Y.; Peng, J.; Zhang, M.
Visual Harmony: LLM’s Power in Crafting Coherent Indoor Scenes from Images Proceedings Article
In: Lin, Z.; Zha, H.; Cheng, M.-M.; He, R.; Liu, C.-L.; Ubul, K.; Silamu, W.; Zhou, J. (Eds.): Lecture Notes in Computer Science, pp. 3–17, Springer Science and Business Media Deutschland GmbH, 2025, ISSN: 0302-9743; ISBN: 978-981-97-8507-0.
Abstract | Links | BibTeX | Tags: Augmented Reality, Depth perception, Indoor scene generation, Input image, Language Model, Large language model, Metaverses, Point-clouds, Power, Scene completion, Scene Generation, Scene-graphs, Semantic Segmentation, Semantics, Virtual Reality, Visual languages
@inproceedings{zhang_visual_2025,
title = {Visual Harmony: LLM’s Power in Crafting Coherent Indoor Scenes from Images},
author = {G. Zhang and Y. Wang and C. Luo and S. Xu and Y. Ming and J. Peng and M. Zhang},
editor = {Lin, Z. and Zha, H. and Cheng, M.-M. and He, R. and Liu, C.-L. and Ubul, K. and Silamu, W. and Zhou, J.},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85209374797&doi=10.1007%2f978-981-97-8508-7_1&partnerID=40&md5=5231ab0bce95fb3f09db80392acd58ff},
doi = {10.1007/978-981-97-8508-7_1},
issn = {0302-9743},
isbn = {978-981-97-8507-0},
year = {2025},
date = {2025-01-01},
booktitle = {Lect. Notes Comput. Sci.},
volume = {15036 LNCS},
pages = {3–17},
publisher = {Springer Science and Business Media Deutschland GmbH},
abstract = {Indoor scene generation has recently attracted significant attention as it is crucial for metaverse, 3D animation, visual effects in movies, and virtual/augmented reality. Existing image-based indoor scene generation methods often produce scenes that are not realistic enough, with issues such as floating objects, incorrect object orientations, and incomplete scenes that only include the part of the scene captured by the input image. To address these challenges, we propose Visual Harmony, a method that leverages the powerful spatial imagination capabilities of a Large Language Model (LLM) to generate corresponding indoor scenes based on the input image. Specifically, we first extract information from the input image through depth estimation and panorama segmentation, reconstructing a semantic point cloud. Using this reconstructed semantic point cloud, we extract a scene graph that describes only the objects in the image. Then we leverage the strong spatial imagination capabilities of the LLM to complete the scene graph, forming a representation of a complete room scene. Based on this fine scene graph, we can generate an entire indoor scene that includes both the captured and uncaptured parts of the input image. Extensive experiments demonstrate that our method can generate realistic, plausible, and highly relevant complete indoor scenes related to the input image. © The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2025.},
keywords = {Augmented Reality, Depth perception, Indoor scene generation, Input image, Language Model, Large language model, Metaverses, Point-clouds, Power, Scene completion, Scene Generation, Scene-graphs, Semantic Segmentation, Semantics, Virtual Reality, Visual languages},
pubstate = {published},
tppubtype = {inproceedings}
}
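As an illustration of the scene-graph completion step described in the abstract above, and not the authors' code, the sketch below shows how a partial scene graph extracted from an image might be handed to an LLM for completion; the JSON schema, the prompt wording, and the call_llm placeholder are all assumptions.

# Illustrative sketch only: prompting an LLM to complete a partial indoor scene graph.
import json

def build_completion_prompt(partial_graph: dict, room_type: str) -> str:
    """Ask the LLM to add plausible missing objects and spatial relations."""
    return (
        f"The following scene graph describes only the objects visible in a photo "
        f"of a {room_type}:\n{json.dumps(partial_graph, indent=2)}\n"
        "Complete it into a full-room scene graph. Add objects that are likely "
        "present but not visible, and give each object a position, orientation, "
        "and spatial relations (e.g. 'on', 'next_to', 'facing'). "
        "Return valid JSON with the same schema."
    )

def complete_scene_graph(partial_graph: dict, room_type: str, call_llm) -> dict:
    """call_llm is any text-in / text-out function; the model choice is up to the user."""
    reply = call_llm(build_completion_prompt(partial_graph, room_type))
    return json.loads(reply)   # the completed graph then drives scene layout/generation

# Example partial graph recovered from depth estimation + panorama segmentation:
partial = {"objects": [{"name": "sofa", "position": [1.2, 0.0, 0.4]},
                       {"name": "coffee_table", "position": [1.0, 0.0, 1.1]}],
           "relations": [["coffee_table", "in_front_of", "sofa"]]}
print(build_completion_prompt(partial, "living room")[:120])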
2024
Chen, M.; Liu, M.; Wang, C.; Song, X.; Zhang, Z.; Xie, Y.; Wang, L.
Cross-Modal Graph Semantic Communication Assisted by Generative AI in the Metaverse for 6G Journal Article
In: Research, vol. 7, 2024, ISSN: 2096-5168.
Abstract | Links | BibTeX | Tags: 3-dimensional, 3Dimensional models, Cross-modal, Graph neural networks, Graph semantics, Metaverses, Multi-modal data, Point-clouds, Semantic communication, Semantic features, Semantics, Three dimensional computer graphics, Virtual scenario
@article{chen_cross-modal_2024,
title = {Cross-Modal Graph Semantic Communication Assisted by Generative AI in the Metaverse for 6G},
author = {M. Chen and M. Liu and C. Wang and X. Song and Z. Zhang and Y. Xie and L. Wang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85192245049&doi=10.34133%2fresearch.0342&partnerID=40&md5=4a1c3e0a3ac877fcdf04937a96da32a1},
doi = {10.34133/research.0342},
issn = {2096-5168},
year = {2024},
date = {2024-01-01},
journal = {Research},
volume = {7},
abstract = {Recently, the development of the Metaverse has become a frontier spotlight, which is an important demonstration of the integration innovation of advanced technologies in the Internet. Moreover, artificial intelligence (AI) and 6G communications will be widely used in our daily lives. However, the effective interactions with the representations of multimodal data among users via 6G communications are the main challenge in the Metaverse. In this work, we introduce an intelligent cross-modal graph semantic communication approach based on generative AI and 3-dimensional (3D) point clouds to improve the diversity of multimodal representations in the Metaverse. Using a graph neural network, multimodal data can be recorded by key semantic features related to the real scenarios. Then, we compress the semantic features using a graph transformer encoder at the transmitter, which can extract the semantic representations through the cross-modal attention mechanisms. Next, we leverage a graph semantic validation mechanism to guarantee the exactness of the overall data at the receiver. Furthermore, we adopt generative AI to regenerate multimodal data in virtual scenarios. Simultaneously, a novel 3D generative reconstruction network is constructed from the 3D point clouds, which can transfer the data from images to 3D models, and we infer the multimodal data into the 3D models to increase realism in virtual scenarios. Finally, the experimental results demonstrate that cross-modal graph semantic communication, assisted by generative AI, has substantial potential for enhancing user interactions in the 6G communications and Metaverse. Copyright © 2024 Mingkai Chen et al.},
keywords = {3-dimensional, 3Dimensional models, Cross-modal, Graph neural networks, Graph semantics, Metaverses, Multi-modal data, Point-clouds, Semantic communication, Semantic features, Semantics, Three dimensional computer graphics, Virtual scenario},
pubstate = {published},
tppubtype = {article}
}
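The following is a rough, illustrative sketch, not the authors' system, of the transmit/compress/validate loop the abstract above describes: node features from a scene graph are compressed by a small transformer encoder at the transmitter, recovered at the receiver, and accepted only if a simple validation score passes. The dimensions, modules, and threshold are assumptions.

# Illustrative sketch only: semantic compression of graph node features with validation.
import torch
import torch.nn as nn

class SemanticTransmitter(nn.Module):
    def __init__(self, feat_dim=64, code_dim=16):
        super().__init__()
        self.encoder = nn.TransformerEncoderLayer(d_model=feat_dim, nhead=4,
                                                  batch_first=True)
        self.compress = nn.Linear(feat_dim, code_dim)   # semantic code sent over the channel
    def forward(self, node_feats):                      # (B, num_nodes, feat_dim)
        return self.compress(self.encoder(node_feats))  # (B, num_nodes, code_dim)

class SemanticReceiver(nn.Module):
    def __init__(self, feat_dim=64, code_dim=16):
        super().__init__()
        self.decompress = nn.Linear(code_dim, feat_dim)
    def forward(self, codes):
        return self.decompress(codes)

def semantic_validation(original, recovered, tol=1.0):
    """Crude stand-in for a validation mechanism: accept the message only if the
    recovered node features stay close to the transmitted semantics."""
    return torch.mean((original - recovered) ** 2).item() < tol

tx, rx = SemanticTransmitter(), SemanticReceiver()
feats = torch.randn(2, 10, 64)              # e.g. 10 scene-graph nodes per sample
recovered = rx(tx(feats))
print(semantic_validation(feats, recovered))  # True/False for this untrained toy model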
Xie, W.; Liu, Y.; Wang, K.; Wang, M.
LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach Journal Article
In: IEEE Signal Processing Letters, vol. 31, pp. 2250–2254, 2024, ISSN: 1070-9908.
Abstract | Links | BibTeX | Tags: 3D reconstruction, Cross-modal, Language Model, Large language model, Learning approach, Multi-modal, Multimodal quality assessment, Point cloud quality assessment, Point-clouds, Quality assessment
@article{xie_llm-guided_2024,
title = {LLM-Guided Cross-Modal Point Cloud Quality Assessment: A Graph Learning Approach},
author = {W. Xie and Y. Liu and K. Wang and M. Wang},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85203417746&doi=10.1109%2fLSP.2024.3452556&partnerID=40&md5=88460ec3043fa9161c4d5dd6fc282f95},
doi = {10.1109/LSP.2024.3452556},
issn = {1070-9908},
year = {2024},
date = {2024-01-01},
journal = {IEEE Signal Processing Letters},
volume = {31},
pages = {2250–2254},
abstract = {This paper addresses the critical need for accurate and reliable point cloud quality assessment (PCQA) in various applications, such as autonomous driving, robotics, virtual reality, and 3D reconstruction. To meet this need, we propose a large language model (LLM)-guided PCQA approach based on graph learning. Specifically, we first utilize the LLM to generate quality description texts for each 3D object, and employ two CLIP-like feature encoders to represent the image and text modalities. Next, we design a latent feature enhancer module to improve contrastive learning, enabling more effective alignment performance. Finally, we develop a graph network fusion module that utilizes a ranking-based loss to adjust the relationship of different nodes, which explicitly considers both modality fusion and quality ranking. Experimental results on three benchmark datasets demonstrate the effectiveness and superiority of our approach over 12 representative PCQA methods, which demonstrate the potential of multi-modal learning, the importance of latent feature enhancement, and the significance of graph-based fusion in advancing the field of PCQA. © 2024 IEEE.},
keywords = {3D reconstruction, Cross-modal, Language Model, Large language model, Learning approach, Multi-modal, Multimodal quality assessment, Point cloud quality assessment, Point-clouds, Quality assessment},
pubstate = {published},
tppubtype = {article}
}
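To illustrate the ranking-based objective mentioned in the abstract above, here is a minimal sketch, not the authors' model, that fuses pre-extracted image and text features into a scalar quality score and trains it with a pairwise margin ranking loss; the feature dimensions, the fusion MLP, and the specific loss choice are assumptions.

# Illustrative sketch only: multimodal fusion head trained with a pairwise ranking loss.
import torch
import torch.nn as nn

class QualityHead(nn.Module):
    """Maps concatenated (image, text) features to a scalar quality score."""
    def __init__(self, img_dim=512, txt_dim=512):
        super().__init__()
        self.fuse = nn.Sequential(nn.Linear(img_dim + txt_dim, 256), nn.ReLU(),
                                  nn.Linear(256, 1))
    def forward(self, img_feat, txt_feat):          # (B, img_dim), (B, txt_dim)
        return self.fuse(torch.cat([img_feat, txt_feat], dim=-1)).squeeze(-1)

head = QualityHead()
rank_loss = nn.MarginRankingLoss(margin=0.2)

# Two batches of pre-extracted features (in the paper these would come from
# CLIP-like image/text encoders); here they are random stand-ins.
img_a, txt_a = torch.randn(8, 512), torch.randn(8, 512)
img_b, txt_b = torch.randn(8, 512), torch.randn(8, 512)
score_a, score_b = head(img_a, txt_a), head(img_b, txt_b)

# target = +1 means sample A should be ranked higher (better quality) than B.
target = torch.ones(8)
loss = rank_loss(score_a, score_b, target)
loss.backward()
print(float(loss))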