Upload genau-full-l.yaml with huggingface_hub
Browse files- genau-full-l.yaml +12 -27
genau-full-l.yaml
CHANGED
|
@@ -4,14 +4,14 @@ training:
|
|
| 4 |
nodes_count: -1
|
| 5 |
|
| 6 |
logging:
|
| 7 |
-
project_name: "
|
| 8 |
-
wandb_key:
|
| 9 |
-
log_directory: "./
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
# Saving Checkpoints
|
| 12 |
-
# if an s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
| 13 |
-
S3_BUCKET: "snap-genvid"
|
| 14 |
-
S3_FOLDER: 'mali6/audioldm'
|
| 15 |
save_checkpoint_every_n_steps: 1500
|
| 16 |
save_top_k: -1
|
| 17 |
|
|
@@ -31,10 +31,10 @@ variables:
|
|
| 31 |
batch_size: &bs 20 # TODO: change to 256
|
| 32 |
|
| 33 |
data:
|
| 34 |
-
metadata_root: "/
|
| 35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
| 36 |
-
val: "
|
| 37 |
-
test: "
|
| 38 |
class_label_indices: "audioset_eval_subset"
|
| 39 |
dataloader_add_ons: []
|
| 40 |
augment_p : 0.0
|
|
@@ -49,6 +49,7 @@ data:
|
|
| 49 |
- caption
|
| 50 |
- best_model_w_meta_pred_caption
|
| 51 |
- gt_audio_caption
|
|
|
|
| 52 |
- wavcaps_caption
|
| 53 |
tags:
|
| 54 |
- keywords
|
|
@@ -89,17 +90,6 @@ model:
|
|
| 89 |
params:
|
| 90 |
# dataset token
|
| 91 |
dataset_embed_dim: 32
|
| 92 |
-
dataset2id:
|
| 93 |
-
audiocaps: 0
|
| 94 |
-
clotho: 1
|
| 95 |
-
vggsounds: 2
|
| 96 |
-
wavcaps_audioset_strong: 3
|
| 97 |
-
wavcaps_bbcsound: 4
|
| 98 |
-
wavcaps_freesound: 5
|
| 99 |
-
wavcaps_soundbible: 6
|
| 100 |
-
fsd50k: 7
|
| 101 |
-
caption_audioset: 8
|
| 102 |
-
|
| 103 |
|
| 104 |
# logging
|
| 105 |
validate_uncond: False
|
|
@@ -214,16 +204,10 @@ model:
|
|
| 214 |
|
| 215 |
# The type of positional encodings to use for the time input
|
| 216 |
time_pe_type: learned
|
| 217 |
-
# Uses a label that specifies whether the current input is a video or an image
|
| 218 |
-
use_video_image_conditioning: False
|
| 219 |
-
# Uses a label that specifies the framerate of the current video
|
| 220 |
-
use_framerate_conditioning: False
|
| 221 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
| 222 |
use_dataset_id_conditioning: True
|
| 223 |
# Uses a label that specifies the resolution of the current input
|
| 224 |
use_resolution_conditioning: False
|
| 225 |
-
# If True uses the unmasked parts of the denoised input as conditioning
|
| 226 |
-
use_denoised_input_conditioning: False
|
| 227 |
|
| 228 |
# Size of the input in pixels
|
| 229 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
|
@@ -348,6 +332,7 @@ model:
|
|
| 348 |
wavcaps_soundbible: 6
|
| 349 |
fsd50k: 7
|
| 350 |
caption_audioset: 8
|
|
|
|
| 351 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
| 352 |
|
| 353 |
|
|
|
|
| 4 |
nodes_count: -1
|
| 5 |
|
| 6 |
logging:
|
| 7 |
+
project_name: "genau"
|
| 8 |
+
wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
|
| 9 |
+
log_directory: "./run_logs/genau/train"
|
| 10 |
+
|
| 11 |
+
# (optional) if an s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
| 12 |
+
# S3_BUCKET: "YOUR_S3_BUCKET"
|
| 13 |
+
# S3_FOLDER: 'YOUR_S3_FOLDER'
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
save_checkpoint_every_n_steps: 1500
|
| 16 |
save_top_k: -1
|
| 17 |
|
|
|
|
| 31 |
batch_size: &bs 20 # TODO: change to 256
|
| 32 |
|
| 33 |
data:
|
| 34 |
+
metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
|
| 35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
| 36 |
+
val: "audioset"
|
| 37 |
+
test: "audioset"
|
| 38 |
class_label_indices: "audioset_eval_subset"
|
| 39 |
dataloader_add_ons: []
|
| 40 |
augment_p : 0.0
|
|
|
|
| 49 |
- caption
|
| 50 |
- best_model_w_meta_pred_caption
|
| 51 |
- gt_audio_caption
|
| 52 |
+
- autocap_caption
|
| 53 |
- wavcaps_caption
|
| 54 |
tags:
|
| 55 |
- keywords
|
|
|
|
| 90 |
params:
|
| 91 |
# dataset token
|
| 92 |
dataset_embed_dim: 32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# logging
|
| 95 |
validate_uncond: False
|
|
|
|
| 204 |
|
| 205 |
# The type of positional encodings to use for the time input
|
| 206 |
time_pe_type: learned
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
| 208 |
use_dataset_id_conditioning: True
|
| 209 |
# Uses a label that specifies the resolution of the current input
|
| 210 |
use_resolution_conditioning: False
|
|
|
|
|
|
|
| 211 |
|
| 212 |
# Size of the input in pixels
|
| 213 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
|
|
|
| 332 |
wavcaps_soundbible: 6
|
| 333 |
fsd50k: 7
|
| 334 |
caption_audioset: 8
|
| 335 |
+
autocap: 9
|
| 336 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
| 337 |
|
| 338 |
|