@echo off
rem ============================================================================
rem EZ-Tokenizer Launcher with Banner
rem
rem Interactive menu that:
rem   1. creates a Python virtual environment next to this script and installs
rem      the dependencies into it,
rem   2. builds a tokenizer from the files in the Dataset folder,
rem   3. runs the tokenizer test script,
rem   4. opens the Dataset folder in Explorer,
rem   5. exits.
rem
rem This script must be run as administrator.
rem Previous versions were known as NexForge Tokenizer. All functionality
rem remains the same, only the name has been updated.
rem
rem Implementation notes:
rem   * Paths are only echoed at the top level of the script (never inside a
rem     parenthesised block), because an unquoted ")" in an expanded path, e.g.
rem     "C:\Program Files (x86)\...", would terminate the block early.
rem   * Delayed expansion is deliberately NOT enabled: several messages end in
rem     "!" and would be altered by it.
rem ============================================================================

rem Keep variable changes (including those made by the venv activate script)
rem from leaking into the calling console.
setlocal

rem Set up directory variables first.
rem %~dp0 ends with a backslash; strip it so "%SCRIPT_DIR%\sub" joins cleanly.
set "SCRIPT_DIR=%~dp0"
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
rem Remember where we were launched from so it can be restored on exit.
set "CURRENT_DIR=%CD%"
cd /d "%SCRIPT_DIR%"

cls
echo.
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo (CodeGen-NF Model Pre-Release)
echo =======================================================
echo Script running from: %SCRIPT_DIR%

:check_admin
rem The exit code of "net session" is used as the elevation test.
net session >nul 2>&1
if %errorlevel% == 0 (
    echo Running with administrator privileges...
) else (
    echo ###########################################################
    echo # #
    echo # EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES #
    echo # Please right-click and select 'Run as administrator' #
    echo # #
    echo ###########################################################
    echo.
    echo Please right-click on this file and select "Run as administrator"
    pause
    rem Put the caller back where it started and report failure.
    cd /d "%CURRENT_DIR%"
    exit /b 1
)

:menu
cls
rem Display banner.
rem NOTE(review): the spacing inside the ASCII-art banner (and inside the "#"
rem box above) looks collapsed in this copy of the file - confirm against the
rem original artwork before release.
echo N N EEEEE X X FFFFF OOOOO RRRR GGGG EEEEE
echo NN N E X X F O O R R G E
echo N N N EEEE X FFFF O O RRRR G GG EEEE
echo N NN E X X F O O R R G G E
echo N N EEEEE X X F OOOOO R R GGGG EEEEE
echo.
echo PRESENTS:
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo =======================================================

rem Display the directories in use, falling back to the current directory if
rem SCRIPT_DIR was somehow lost.
if defined SCRIPT_DIR goto show_dirs
echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
set "SCRIPT_DIR=%CD%"
goto dirs_shown

:show_dirs
echo Current directory: %CD%
echo Script directory: %SCRIPT_DIR%

:dirs_shown
echo.
echo MINIMUM REQUIREMENTS:
echo - Python 3.8 or higher
echo - 4GB RAM minimum (8GB+ recommended)
echo - 1GB free disk space
echo.
echo DATASET INFORMATION:
echo - Dataset location: %SCRIPT_DIR%\Dataset\
echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.
echo.
echo MENU:
echo 1. Install Dependencies
echo 2. Create Tokenizer (50k vocab, min_freq=2)
echo 3. Test Tokenizer (2 runs with 10,000 samples)
echo 4. Open Dataset Directory
echo 5. Exit
echo.

rem "set /p" leaves the variable untouched when the user just presses Enter,
rem so clear it first; otherwise the previous selection would silently repeat.
set "choice="
set /p "choice=Enter your choice (1-5): "
echo.
if not defined choice goto invalid_choice
rem Strip double quotes so stray input cannot break the comparisons below.
set "choice=%choice:"=%"
if not defined choice goto invalid_choice

if "%choice%"=="1" goto install_deps
if "%choice%"=="2" goto create_tokenizer
if "%choice%"=="3" goto test_tokenizer
if "%choice%"=="4" goto open_dataset
if "%choice%"=="5" goto exit

:invalid_choice
echo Invalid choice. Please enter a number between 1 and 5.
pause
goto menu

rem ----------------------------------------------------------------------------
rem Option 1: create the virtual environment and install all dependencies.
rem ----------------------------------------------------------------------------
:install_deps
echo Installing dependencies...
echo This may take a few minutes...
echo.

rem Create the virtual environment if it is missing or incomplete. The
rem activate script is checked (not just the folder) so a half-created venv is
rem rebuilt instead of being silently skipped.
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Creating virtual environment...
    python -m venv "%SCRIPT_DIR%\venv"
    if errorlevel 1 (
        echo Failed to create virtual environment
        pause
        goto menu
    )
)

rem Never continue without an activate script: the pip commands below would
rem otherwise run against whatever Python is first on PATH.
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Failed to create virtual environment
    pause
    goto menu
)

rem Activate the virtual environment for the installs below.
call "%SCRIPT_DIR%\venv\Scripts\activate.bat"

rem Upgrade pip first.
echo [INFO] Upgrading pip...
python -m pip install --upgrade pip
if errorlevel 1 (
    echo [ERROR] Failed to upgrade pip
    pause
    goto menu
)

rem Install the PyTorch CPU build; fall back to unpinned versions if the pinned
rem ones cannot be installed.
echo [INFO] Installing PyTorch CPU version...
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
    echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    if errorlevel 1 (
        echo [ERROR] Failed to install PyTorch
        echo [INFO] Please check your internet connection and try again
        pause
        goto menu
    )
)

rem Install the remaining dependencies; retry once without the pip cache.
echo [INFO] Installing additional dependencies...
pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
    echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
    pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
    if errorlevel 1 (
        echo [ERROR] Failed to install additional dependencies
        pause
        goto menu
    )
)

rem Install tokenizers from a pre-built wheel; fall back to --no-deps.
echo [INFO] Installing tokenizers...
pip install tokenizers==0.21.1 --only-binary :all:
if errorlevel 1 (
    echo [WARNING] Could not install tokenizers with pre-built wheel
    echo [INFO] Trying alternative installation method...
    pip install tokenizers==0.21.1 --no-deps
    if errorlevel 1 (
        echo [ERROR] Failed to install tokenizers
        echo Note: This package requires a C++ build toolchain or a pre-built wheel.
        echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
        pause
        goto menu
    )
)

echo.
echo [INFO] All dependencies installed successfully!
echo [INFO] Installing nexforgetokenizer in development mode...
rem Use the explicit script directory rather than "." so the install does not
rem depend on the current working directory.
python -m pip install -e "%SCRIPT_DIR%"
if errorlevel 1 (
    echo [ERROR] Failed to install nexforgetokenizer in development mode
    pause
    goto menu
)
echo [INFO] Package installation complete!
pause
goto menu

rem ----------------------------------------------------------------------------
rem Option 2: build output\tokenizer.json from the files in Dataset.
rem ----------------------------------------------------------------------------
:create_tokenizer
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Virtual environment not found. Please install dependencies first.
    pause
    goto menu
)
call "%SCRIPT_DIR%\venv\Scripts\activate.bat"

rem Create output directory if it doesn't exist.
if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"

rem If the dataset directory is missing, create it, show it, and go back to
rem the menu so the user can add files.
if exist "%SCRIPT_DIR%\Dataset" goto dataset_dir_present
echo Creating Dataset directory...
mkdir "%SCRIPT_DIR%\Dataset"
echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu

:dataset_dir_present
rem "dir /b" returns a non-zero exit code when nothing matches.
dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
if not errorlevel 1 goto dataset_has_files
echo No files found in: %SCRIPT_DIR%\Dataset
echo Please add your dataset files to this directory.
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu

:dataset_has_files
echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX
if errorlevel 1 (
    echo Failed to create tokenizer
    pause
    goto menu
)
echo.
echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
echo Vocabulary size: 50,000
echo Minimum frequency: 2
echo Processed all available files in the dataset
echo.
echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
pause
goto menu

rem ----------------------------------------------------------------------------
rem Option 3: run the tokenizer test script against the Dataset folder.
rem ----------------------------------------------------------------------------
:test_tokenizer
if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" (
    echo Virtual environment not found. Please install dependencies first.
    pause
    goto menu
)
call "%SCRIPT_DIR%\venv\Scripts\activate.bat"

rem Create test_result directory if it doesn't exist.
if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"

rem The test needs a tokenizer produced by option 2.
if exist "%SCRIPT_DIR%\output\tokenizer.json" goto tokenizer_present
echo EZ-Tokenizer not found. Please create a tokenizer first.
echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
pause
goto menu

:tokenizer_present
echo Running test with 10,000 samples...
echo Testing EZ-Tokenizer with 10,000 samples...
python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"
if errorlevel 1 (
    echo Test run failed
    pause
    goto menu
)
echo.
rem NOTE(review): the menu and this message speak of two runs, but only one
rem invocation of the test script is visible here - confirm whether the script
rem performs both runs itself or the wording is stale.
echo Both test runs completed successfully!
echo Results saved to: %SCRIPT_DIR%\test_result\

rem Open the test results directory.
if exist "%SCRIPT_DIR%\test_result\" (
    start "" "%SCRIPT_DIR%\test_result\"
) else (
    echo Warning: Test results directory not found.
)
pause
goto menu

rem ----------------------------------------------------------------------------
rem Option 4: open (creating if necessary) the Dataset folder in Explorer.
rem ----------------------------------------------------------------------------
:open_dataset
if not exist "%SCRIPT_DIR%\Dataset" mkdir "%SCRIPT_DIR%\Dataset"
start "" "%SCRIPT_DIR%\Dataset"
goto menu

rem ----------------------------------------------------------------------------
rem Option 5: restore the caller's directory and leave.
rem ----------------------------------------------------------------------------
:exit
cd /d "%CURRENT_DIR%"
echo Exiting NexForge Tokenizer Manager...
timeout /t 2 >nul
rem "exit /b" ends only this script. A bare "exit" would also close the console
rem (or parent script) that launched it, defeating the directory restore above.
exit /b 0