Upload genau-full-l.yaml with huggingface_hub
Browse files- genau-full-l.yaml +12 -27
genau-full-l.yaml
CHANGED
|
@@ -4,14 +4,14 @@ training:
|
|
| 4 |
nodes_count: -1
|
| 5 |
|
| 6 |
logging:
|
| 7 |
-
project_name: "
|
| 8 |
-
wandb_key:
|
| 9 |
-
log_directory: "./
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
# Saving Checkpoints
|
| 12 |
-
# if an s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
| 13 |
-
S3_BUCKET: "snap-genvid"
|
| 14 |
-
S3_FOLDER: 'mali6/audioldm'
|
| 15 |
save_checkpoint_every_n_steps: 1500
|
| 16 |
save_top_k: -1
|
| 17 |
|
|
@@ -31,10 +31,10 @@ variables:
|
|
| 31 |
batch_size: &bs 20 # TODO: change to 256
|
| 32 |
|
| 33 |
data:
|
| 34 |
-
metadata_root: "/
|
| 35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
| 36 |
-
val: "
|
| 37 |
-
test: "
|
| 38 |
class_label_indices: "audioset_eval_subset"
|
| 39 |
dataloader_add_ons: []
|
| 40 |
augment_p : 0.0
|
|
@@ -49,6 +49,7 @@ data:
|
|
| 49 |
- caption
|
| 50 |
- best_model_w_meta_pred_caption
|
| 51 |
- gt_audio_caption
|
|
|
|
| 52 |
- wavcaps_caption
|
| 53 |
tags:
|
| 54 |
- keywords
|
|
@@ -89,17 +90,6 @@ model:
|
|
| 89 |
params:
|
| 90 |
# dataset token
|
| 91 |
dataset_embed_dim: 32
|
| 92 |
-
dataset2id:
|
| 93 |
-
audiocaps: 0
|
| 94 |
-
clotho: 1
|
| 95 |
-
vggsounds: 2
|
| 96 |
-
wavcaps_audioset_strong: 3
|
| 97 |
-
wavcaps_bbcsound: 4
|
| 98 |
-
wavcaps_freesound: 5
|
| 99 |
-
wavcaps_soundbible: 6
|
| 100 |
-
fsd50k: 7
|
| 101 |
-
caption_audioset: 8
|
| 102 |
-
|
| 103 |
|
| 104 |
# logging
|
| 105 |
validate_uncond: False
|
|
@@ -214,16 +204,10 @@ model:
|
|
| 214 |
|
| 215 |
# The type of positional encodings to use for the time input
|
| 216 |
time_pe_type: learned
|
| 217 |
-
# Uses a label that specifies whether the current input is a video or an image
|
| 218 |
-
use_video_image_conditioning: False
|
| 219 |
-
# Uses a label that specifies the framerate of the current video
|
| 220 |
-
use_framerate_conditioning: False
|
| 221 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
| 222 |
use_dataset_id_conditioning: True
|
| 223 |
# Uses a label that specifies the resolution of the current input
|
| 224 |
use_resolution_conditioning: False
|
| 225 |
-
# If True uses the unmasked parts of the denoised input as conditioning
|
| 226 |
-
use_denoised_input_conditioning: False
|
| 227 |
|
| 228 |
# Size of the input in pixels
|
| 229 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
|
@@ -348,6 +332,7 @@ model:
|
|
| 348 |
wavcaps_soundbible: 6
|
| 349 |
fsd50k: 7
|
| 350 |
caption_audioset: 8
|
|
|
|
| 351 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
| 352 |
|
| 353 |
|
|
|
|
| 4 |
nodes_count: -1
|
| 5 |
|
| 6 |
logging:
|
| 7 |
+
project_name: "genau"
|
| 8 |
+
wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
|
| 9 |
+
log_directory: "./run_logs/genau/train"
|
| 10 |
+
|
| 11 |
+
# (optional) if an s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
| 12 |
+
# S3_BUCKET: "YOUR_S3_BUCKET"
|
| 13 |
+
# S3_FOLDER: 'YOUR_S3_FOLDER'
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
save_checkpoint_every_n_steps: 1500
|
| 16 |
save_top_k: -1
|
| 17 |
|
|
|
|
| 31 |
batch_size: &bs 20 # TODO: change to 256
|
| 32 |
|
| 33 |
data:
|
| 34 |
+
metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
|
| 35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
| 36 |
+
val: "audioset"
|
| 37 |
+
test: "audioset"
|
| 38 |
class_label_indices: "audioset_eval_subset"
|
| 39 |
dataloader_add_ons: []
|
| 40 |
augment_p : 0.0
|
|
|
|
| 49 |
- caption
|
| 50 |
- best_model_w_meta_pred_caption
|
| 51 |
- gt_audio_caption
|
| 52 |
+
- autocap_caption
|
| 53 |
- wavcaps_caption
|
| 54 |
tags:
|
| 55 |
- keywords
|
|
|
|
| 90 |
params:
|
| 91 |
# dataset token
|
| 92 |
dataset_embed_dim: 32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# logging
|
| 95 |
validate_uncond: False
|
|
|
|
| 204 |
|
| 205 |
# The type of positional encodings to use for the time input
|
| 206 |
time_pe_type: learned
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
| 208 |
use_dataset_id_conditioning: True
|
| 209 |
# Uses a label that specifies the resolution of the current input
|
| 210 |
use_resolution_conditioning: False
|
|
|
|
|
|
|
| 211 |
|
| 212 |
# Size of the input in pixels
|
| 213 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
|
|
|
| 332 |
wavcaps_soundbible: 6
|
| 333 |
fsd50k: 7
|
| 334 |
caption_audioset: 8
|
| 335 |
+
autocap: 9
|
| 336 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
| 337 |
|
| 338 |
|