🏆 LLM4SE Leaderboard

Community-Driven Evaluation of Top Large Language Models (LLMs) in Software Engineering (SE) Tasks

The SWE-Chatbot-Arena is an open-source platform designed to evaluate LLMs through human preferences, fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the performance of leading LLMs on SE-related tasks. For technical details, check out our paper.

{
  "headers": [
    "Rank",
    "Model",
    "Organization",
    "Elo Score",
    "Win Rate",
    "Conversation Efficiency Index",
    "Conversation Consistency Index",
    "Bradley-Terry Coefficient",
    "Eigenvector Centrality Value",
    "Newman Modularity Score",
    "PageRank Score"
  ],
  "data": [
    [1, "Voxtral Small 24B 2507", "Mistral", 1003.98, 1, 1, null, 0, 1, 1, 0.12],
    [2, "o3", "OpenAI", 1002.01, 1, null, null, 0, 0, 0, 0.05],
    [3, "Claude 3.5 Sonnet", "Anthropic", 1002, 1, null, null, 0, 0, 0, 0.04],
    [3, "Grok 3 Mini", "xAI", 1002, 1, null, null, 0, 0, 0, 0.04],
    [3, "Hermes 2 Pro - Llama-3 8B", "NousResearch", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "Mistral 7B Instruct", "Mistral", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "GLM 4.5 Air", "Z.AI", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "Llemma 7b", "EleutherAI", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "GLM 4 32B", "Z.AI", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "DeepSeek V3.1", "DeepSeek", 1002, 1, null, null, 0, 0, 0, 0.03],
    [3, "ERNIE 4.5 VL 28B A3B", "Baidu", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [3, "Qwen3 VL 30B A3B Thinking", "Qwen", 1002, 1, 1, null, 0, 0, 0, 0.03],
    [13, "Ministral 3 3B 2512", "Mistral", 1001.99, 1, 1, null, 0, 0, 0, 0.03],
    [14, "Gemini 2.5 Pro Preview 06-05", "", 1000, 0.67, null, 0, 0, 0, 0, 0.03],
    [15, "GPT-4o", "OpenAI", 999.99, 0.5, null, null, 0, 0, 0, 0.03],
    [16, "Gemma 3 27B", "Google", 998, 0.5, null, null, 0, 0, 0, 0.03],
    [16, "Claude 3.7 Sonnet", "Anthropic", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "Gemini 2.5 Pro", "Google", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "Qwen3 30B A3B", "Qwen", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "Llama 3 Euryale 70B v2.1", "", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "Grok Code Fast 1", "xAI", 998, 0, 0.3, null, 0, 0, 0, 0.02],
    [16, "Qwen3 8B", "Qwen", 998, 0, 0.3, null, 0, 0, 0, 0.02],
    [16, "Mistral Large", "Mistral", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "Grok 3", "xAI", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "GPT-5 Mini", "OpenAI", 998, 0, null, null, 0, 0, 0, 0.02],
    [16, "Qwen3 235B A22B", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "Command R7B (12-2024)", "Cohere", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "Gemma 2 27B", "Google", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "Qwen3 Coder Flash", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "o3 Mini High", "OpenAI", 998, 0, 0.3, null, 0, 0, 0, 0.02],
    [16, "Trinity Mini", "Arcee AI", 998, 0, -1, null, 0, 0, 0, 0.02],
    [16, "Nemotron Nano 9B V2", "NVIDIA", 998, 0, 0.3, null, 0, 0, 0, 0.02],
    [33, "gpt-oss-120b", "OpenAI", 997.98, 0, -1, null, 0, 0, 0, 0.02],
    [34, "Claude 3.7 Sonnet (thinking)", "Anthropic", 996.02, 0, null, null, 0, 0, 0, 0.02],
    [34, "LFM2-2.6B", "LiquidAI", 996.02, 0, -1, null, 0, 0, 0, 0.02],
    [36, "GPT-4o-mini", "OpenAI", 996.01, 0, -1, null, 0, 0, 0, 0.02],
    [37, "Qwen-Max", "Qwen", 996, 0, null, 1, 0, 0, 0, 0.02]
  ],
  "metadata": null
}

Made with ❤️ for SWE-Chatbot-Arena. If this work is useful to you, please consider citing our vision paper:

@inproceedings{zhao2025se,
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering},
author={Zhao, Zhimin},
booktitle={2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)},
pages={78--81},
year={2025},
organization={IEEE}
}