Upload Qwen2VLForConditionalGeneration

#5
by albertvillanova HF Staff - opened
config.json CHANGED
@@ -20,9 +20,8 @@
20
  "rms_norm_eps": 1e-06,
21
  "rope_scaling": {
22
  "mrope_section": [
23
- 16,
24
- 24,
25
- 24
26
  ],
27
  "rope_type": "default",
28
  "type": "default"
@@ -39,6 +38,7 @@
39
  "eos_token_id": 151645,
40
  "hidden_act": "silu",
41
  "hidden_size": 16,
 
42
  "initializer_range": 0.02,
43
  "intermediate_size": 8960,
44
  "layer_types": [
@@ -65,10 +65,13 @@
65
  "tie_word_embeddings": true,
66
  "use_cache": true,
67
  "use_sliding_window": false,
 
 
 
68
  "vision_token_id": 151654,
69
  "vocab_size": 151936
70
  },
71
- "transformers_version": "4.57.5",
72
  "use_cache": true,
73
  "use_sliding_window": false,
74
  "video_token_id": 151656,
@@ -82,8 +85,7 @@
82
  "initializer_range": 0.02,
83
  "mlp_ratio": 4,
84
  "model_type": "qwen2_vl",
85
- "num_attention_heads": 4,
86
- "num_heads": 16,
87
  "num_hidden_layers": 2,
88
  "num_key_value_heads": 2,
89
  "patch_size": 14,
 
20
  "rms_norm_eps": 1e-06,
21
  "rope_scaling": {
22
  "mrope_section": [
23
+ 1,
24
+ 1
 
25
  ],
26
  "rope_type": "default",
27
  "type": "default"
 
38
  "eos_token_id": 151645,
39
  "hidden_act": "silu",
40
  "hidden_size": 16,
41
+ "image_token_id": null,
42
  "initializer_range": 0.02,
43
  "intermediate_size": 8960,
44
  "layer_types": [
 
65
  "tie_word_embeddings": true,
66
  "use_cache": true,
67
  "use_sliding_window": false,
68
+ "video_token_id": null,
69
+ "vision_end_token_id": 151653,
70
+ "vision_start_token_id": 151652,
71
  "vision_token_id": 151654,
72
  "vocab_size": 151936
73
  },
74
+ "transformers_version": "4.56.2",
75
  "use_cache": true,
76
  "use_sliding_window": false,
77
  "video_token_id": 151656,
 
85
  "initializer_range": 0.02,
86
  "mlp_ratio": 4,
87
  "model_type": "qwen2_vl",
88
+ "num_heads": 4,
 
89
  "num_hidden_layers": 2,
90
  "num_key_value_heads": 2,
91
  "patch_size": 14,
generation_config.json CHANGED
@@ -9,5 +9,5 @@
9
  "temperature": 0.01,
10
  "top_k": 1,
11
  "top_p": 0.001,
12
- "transformers_version": "4.57.5"
13
  }
 
9
  "temperature": 0.01,
10
  "top_k": 1,
11
  "top_p": 0.001,
12
+ "transformers_version": "4.56.2"
13
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b17688b7042bd32913d5970b53043b8481f071e73ab15e4da04895eb80930c3e
3
  size 7081872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71a6a57d13133e42a74dfe4323c7dbb53374b97eb0617dee972d2346ab4e847a
3
  size 7081872
preprocessor_config.json CHANGED
@@ -7,7 +7,6 @@
7
  "do_center_crop": null,
8
  "do_convert_rgb": true,
9
  "do_normalize": true,
10
- "do_pad": null,
11
  "do_rescale": true,
12
  "do_resize": true,
13
  "image_mean": [
@@ -25,7 +24,6 @@
25
  "max_pixels": 12845056,
26
  "merge_size": 2,
27
  "min_pixels": 3136,
28
- "pad_size": null,
29
  "patch_size": 14,
30
  "processor_class": "Qwen2VLProcessor",
31
  "resample": 3,
 
7
  "do_center_crop": null,
8
  "do_convert_rgb": true,
9
  "do_normalize": true,
 
10
  "do_rescale": true,
11
  "do_resize": true,
12
  "image_mean": [
 
24
  "max_pixels": 12845056,
25
  "merge_size": 2,
26
  "min_pixels": 3136,
 
27
  "patch_size": 14,
28
  "processor_class": "Qwen2VLProcessor",
29
  "resample": 3,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
3
- size 11420371
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a3a6fcb80132f76da8aa40cdc3fccd7e5d8468ef15421f5b0c2715e85217d2
3
+ size 11420538
video_preprocessor_config.json CHANGED
@@ -6,6 +6,7 @@
6
  "do_center_crop": null,
7
  "do_convert_rgb": true,
8
  "do_normalize": true,
 
9
  "do_rescale": true,
10
  "do_resize": true,
11
  "do_sample_frames": false,
@@ -27,7 +28,6 @@
27
  "min_frames": 4,
28
  "min_pixels": 3136,
29
  "num_frames": null,
30
- "pad_size": null,
31
  "patch_size": 14,
32
  "processor_class": "Qwen2VLProcessor",
33
  "resample": 3,
@@ -37,6 +37,7 @@
37
  "longest_edge": 12845056,
38
  "shortest_edge": 3136
39
  },
 
40
  "temporal_patch_size": 2,
41
  "video_metadata": null,
42
  "video_processor_type": "Qwen2VLVideoProcessor"
 
6
  "do_center_crop": null,
7
  "do_convert_rgb": true,
8
  "do_normalize": true,
9
+ "do_pad": null,
10
  "do_rescale": true,
11
  "do_resize": true,
12
  "do_sample_frames": false,
 
28
  "min_frames": 4,
29
  "min_pixels": 3136,
30
  "num_frames": null,
 
31
  "patch_size": 14,
32
  "processor_class": "Qwen2VLProcessor",
33
  "resample": 3,
 
37
  "longest_edge": 12845056,
38
  "shortest_edge": 3136
39
  },
40
+ "size_divisor": null,
41
  "temporal_patch_size": 2,
42
  "video_metadata": null,
43
  "video_processor_type": "Qwen2VLVideoProcessor"