SNIFFER is a multimodal large language model specifically engineered for Out-Of-Context misinformation detection and explanation.
It employs two-stage instruction tuning on InstructBLIP, including news-domain alignment and task-specific tuning.
The model is composed of three parts: 1) internal checking, which analyzes the consistency of the image and text content; 2) external checking, which analyzes the relevance between the context of the retrieved image and the provided text; and 3) composed reasoning, which combines the two-pronged analysis to arrive at a final judgment and explanation.
This checkpoint is used for the internal checking part.
@inproceedings{qi2023sniffer,
  author    = {Qi, Peng and Yan, Zehong and Hsu, Wynne and Lee, Mong Li},
  title     = {{SNIFFER}: Multimodal Large Language Model for Explainable Out-of-Context Misinformation Detection},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition},
  year      = {2024},
}