@echo off
:: Keep the variables set by this script from leaking into the calling shell.
setlocal

:: EZ-Tokenizer Launcher with Banner
:: This script must be run as administrator (verified under :check_admin).
:: Previous versions were known as NexForge Tokenizer; all functionality
:: remains the same, only the name has been updated.

:: Set up directory variables first.
::   SCRIPT_DIR  - folder containing this script, trailing backslash removed
::   CURRENT_DIR - working directory at launch (restored under :exit)
set "SCRIPT_DIR=%~dp0"
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
set "CURRENT_DIR=%CD%"

:: Switch to the script folder so later steps that use relative paths run
:: there. %~dp0 is used instead of SCRIPT_DIR because a script stored in a
:: drive root would leave SCRIPT_DIR as a bare drive letter such as "C:",
:: which does not name the root folder. Stop if the change fails.
cd /d "%~dp0" || (
    echo [ERROR] Unable to switch to the script directory.
    pause
    exit /b 1
)

cls

:: Start-up banner
echo.
echo =======================================================
echo                EZ-TOKENIZER v1.0.0
echo          (CodeGen-NF Model Pre-Release)
echo =======================================================
echo Script running from: %SCRIPT_DIR%

:check_admin
:: Probe for elevation by running `net session` with its output discarded;
:: a non-zero exit code is treated as "not running as administrator".
:: `if errorlevel` tests the real exit code, whereas %errorLevel% could be
:: shadowed by an environment variable of the same name.
:: On failure the launch directory is restored (when it was recorded) and the
:: script returns exit code 1 to its caller.
net session >nul 2>&1
if not errorlevel 1 (
    echo Running with administrator privileges...
) else (
    echo ###########################################################
    echo #                                                         #
    echo #  EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES        #
    echo #  Please right-click and select 'Run as administrator'   #
    echo #                                                         #
    echo ###########################################################
    echo.
    echo Please right-click on this file and select "Run as administrator"
    pause
    if defined CURRENT_DIR cd /d "%CURRENT_DIR%"
    exit /b 1
)

:menu
:: Main menu: clears the screen, prints the banner, requirements and dataset
:: information, reads a choice (1-5) and jumps to the matching label.
:: Anything else prints an error and shows the menu again.
cls
:: Display banner
echo   N   N  EEEEE  X   X  FFFFF  OOOOO  RRRR   GGGG  EEEEE
echo   NN  N  E       X X   F      O   O  R   R  G      E     
echo   N N N  EEEE     X    FFFF   O   O  RRRR   G  GG  EEEE 
echo   N  NN  E       X X   F      O   O  R  R   G   G  E     
echo   N   N  EEEEE  X   X  F      OOOOO  R   R   GGGG  EEEEE
echo.
echo   PRESENTS:
echo =======================================================
echo                 EZ-TOKENIZER v1.0.0
echo =======================================================
:: Display the script location. The echo lines are deliberately kept outside
:: any parenthesised block: an unquoted path containing ")" (for example
:: "Program Files (x86)") would otherwise close the block early and abort the
:: script with a syntax error.
if not defined SCRIPT_DIR goto menu_dir_fallback
echo Current directory: %~dp0
echo Script directory: %~dp0
goto menu_show_info

:menu_dir_fallback
echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
set "SCRIPT_DIR=%CD%"

:menu_show_info
echo.
echo MINIMUM REQUIREMENTS:
echo - Python 3.8 or higher
echo - 4GB RAM minimum (8GB+ recommended)
echo - 1GB free disk space

echo.
echo DATASET INFORMATION:
echo - Dataset location: %SCRIPT_DIR%\Dataset\
echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.

echo.
echo MENU:
echo 1. Install Dependencies
echo 2. Create Tokenizer (50k vocab, min_freq=2)
echo 3. Test Tokenizer (2 runs with 10,000 samples)
echo 4. Open Dataset Directory
echo 5. Exit
echo.
:: set /p leaves the variable untouched when the user just presses Enter, so
:: clear it first; otherwise the previous selection would silently run again.
set "choice="
set /p choice=Enter your choice (1-5): 

echo.

:: Empty input is invalid. Double quotes are stripped before comparing because
:: a quote inside %choice% would break the syntax of the IF comparisons below.
if not defined choice goto menu_invalid
set "choice=%choice:"=%"
if not defined choice goto menu_invalid

if "%choice%"=="1" goto install_deps
if "%choice%"=="2" goto create_tokenizer
if "%choice%"=="3" goto test_tokenizer
if "%choice%"=="4" goto open_dataset
if "%choice%"=="5" goto exit

:menu_invalid
echo Invalid choice. Please enter a number between 1 and 5.
pause
goto menu

:install_deps
:: Menu option 1: create the virtual environment under %SCRIPT_DIR%\venv when
:: needed, activate it, then install pip, PyTorch (CPU wheels), the helper
:: packages, tokenizers and finally this project in editable mode.
:: Every failure path pauses and returns to the menu.
echo Installing dependencies...
echo This may take a few minutes...
echo.

:: Create the virtual environment when its activation script is missing.
:: Testing for activate.bat rather than just the venv folder means a partially
:: created environment is rebuilt instead of being silently skipped.
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Creating virtual environment...
    python -m venv "%SCRIPT_DIR%\venv"
    if errorlevel 1 (
        echo Failed to create virtual environment
        pause
        goto menu
    )
)

:: Without the activation script the installs below would go into whatever
:: Python happens to be on PATH, so stop here.
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Failed to create virtual environment
    pause
    goto menu
)

:: Activate virtual environment and install dependencies
call "%SCRIPT_DIR%\venv\Scripts\activate.bat"

:: Upgrade pip first. pip is always invoked as "python -m pip" so that it
:: belongs to the same interpreter that "python" resolves to.
echo [INFO] Upgrading pip...
python -m pip install --upgrade pip
if errorlevel 1 (
    echo [ERROR] Failed to upgrade pip
    pause
    goto menu
)

:: Install PyTorch CPU version; fall back to unpinned versions if the pinned
:: set cannot be installed.
echo [INFO] Installing PyTorch CPU version...
python -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
    echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
    python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    if errorlevel 1 (
        echo [ERROR] Failed to install PyTorch
        echo [INFO] Please check your internet connection and try again
        pause
        goto menu
    )
)

:: Install the remaining dependencies in a single pip invocation; retry once
:: with the pip cache disabled.
echo [INFO] Installing additional dependencies...
python -m pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
    echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
    python -m pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
    if errorlevel 1 (
        echo [ERROR] Failed to install additional dependencies
        pause
        goto menu
    )
)

:: Install tokenizers from a pre-built wheel; fall back to a --no-deps install.
echo [INFO] Installing tokenizers...
python -m pip install tokenizers==0.21.1 --only-binary :all:
if errorlevel 1 (
    echo [WARNING] Could not install tokenizers with pre-built wheel
    echo [INFO] Trying alternative installation method...
    python -m pip install tokenizers==0.21.1 --no-deps
    if errorlevel 1 (
        echo [ERROR] Failed to install tokenizers
        echo Note: This package requires a C++ build toolchain or a pre-built wheel.
        echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
        pause
        goto menu
    )
)

echo.
echo [INFO] All dependencies installed successfully!

:: Editable install of the project itself. The script folder is passed
:: explicitly so the result does not depend on the current directory.
echo [INFO] Installing nexforgetokenizer in development mode...
python -m pip install -e "%SCRIPT_DIR%"
if errorlevel 1 (
    echo [ERROR] Failed to install nexforgetokenizer in development mode
    pause
    goto menu
)

echo [INFO] Package installation complete!
pause
goto menu

:create_tokenizer
:: Menu option 2: activate the virtual environment, make sure the output and
:: Dataset folders exist and that Dataset is not empty, then run
:: nexforgetokenizer.adaptive_tokenizer to write output\tokenizer.json.
:: Messages that print %SCRIPT_DIR% are kept outside parenthesised blocks: an
:: unquoted path containing ")" would otherwise end the block early and abort
:: the script with a syntax error.
if not exist "%SCRIPT_DIR%\venv" (
    echo Virtual environment not found. Please install dependencies first.
    pause
    goto menu
)

call "%SCRIPT_DIR%\venv\Scripts\activate"

:: Create output directory if it doesn't exist
if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"

:: Check if dataset directory exists; create and open it when it does not
if exist "%SCRIPT_DIR%\Dataset" goto create_tokenizer_check_files
echo Creating Dataset directory...
mkdir "%SCRIPT_DIR%\Dataset"
echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu

:create_tokenizer_check_files
:: Check if there are any entries in the Dataset directory. "if errorlevel"
:: reads the real exit code of dir, which an environment variable named
:: ERRORLEVEL could otherwise mask.
dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
if not errorlevel 1 goto create_tokenizer_run
echo No files found in: %SCRIPT_DIR%\Dataset
echo Please add your dataset files to this directory.
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu

:create_tokenizer_run
:: Arguments: dataset folder, output file, then 50000, 2 and MAX. The messages
:: here describe these as vocabulary size, minimum frequency and "all files".
echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX

if errorlevel 1 (
    echo Failed to create tokenizer
    pause
    goto menu
)

echo.
echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
echo Vocabulary size: 50,000
echo Minimum frequency: 2
echo Processed all available files in the dataset
echo.
echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
pause
goto menu

:test_tokenizer
:: Menu option 3: activate the virtual environment, make sure test_result
:: exists and that output\tokenizer.json is present, run
:: Test_tokenizer\test_tokenizer.py against the Dataset folder and open the
:: results folder afterwards.
if not exist "%SCRIPT_DIR%\venv" (
    echo Virtual environment not found. Please install dependencies first.
    pause
    goto menu
)

call "%SCRIPT_DIR%\venv\Scripts\activate"

:: Create test_result directory if it doesn't exist
if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"

:: Check if tokenizer exists. The message that prints %SCRIPT_DIR% is kept
:: outside a parenthesised block: an unquoted path containing ")" would
:: otherwise end the block early and abort the script with a syntax error.
if exist "%SCRIPT_DIR%\output\tokenizer.json" goto test_tokenizer_run
echo EZ-Tokenizer not found. Please create a tokenizer first.
echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
pause
goto menu

:test_tokenizer_run
echo Running test with 10,000 samples...
echo Testing EZ-Tokenizer with 10,000 samples...
python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"

if errorlevel 1 (
    echo Test run failed
    pause
    goto menu
)

:: NOTE(review): only one python invocation is made above although the
:: message below mentions "both" runs - confirm whether test_tokenizer.py
:: performs two runs internally or whether the wording is outdated.
echo.
echo Both test runs completed successfully!
echo Results saved to: %SCRIPT_DIR%\test_result\

:: Open the test results directory
if exist "%SCRIPT_DIR%\test_result\" (
    start "" "%SCRIPT_DIR%\test_result\"
) else (
    echo Warning: Test results directory not found.
)

pause
goto menu

:open_dataset
:: Menu option 4: make sure the Dataset folder exists, open it with `start`,
:: then go back to the menu.
if not exist "%SCRIPT_DIR%\Dataset" mkdir "%SCRIPT_DIR%\Dataset"
start "" "%SCRIPT_DIR%\Dataset"
goto menu

:exit
:: Menu option 5: restore the directory the script was launched from, print a
:: farewell message, wait two seconds and end the script.
:: "exit /b" ends only this batch file. A bare "exit" would also terminate the
:: hosting cmd session, which makes the directory restore above pointless and
:: closes any console or calling script that started this launcher.
cd /d "%CURRENT_DIR%"
echo Exiting NexForge Tokenizer Manager...
timeout /t 2 >nul
exit /b 0