-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbibliography.bib
More file actions
1119 lines (1035 loc) · 47.1 KB
/
bibliography.bib
File metadata and controls
1119 lines (1035 loc) · 47.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{power2022grokking,
title={Grokking: Generalization beyond overfitting on small algorithmic datasets},
author={Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
journal={arXiv preprint arXiv:2201.02177},
year={2022}
}
@article{wang2023see,
title={To see is to believe: Prompting gpt-4v for better visual instruction tuning},
author={Wang, Junke and Meng, Lingchen and Weng, Zejia and He, Bo and Wu, Zuxuan and Jiang, Yu-Gang},
journal={arXiv preprint arXiv:2311.07574},
year={2023}
}
@article{zhang2023llavar,
title={Llavar: Enhanced visual instruction tuning for text-rich image understanding},
author={Zhang, Yanzhe and Zhang, Ruiyi and Gu, Jiuxiang and Zhou, Yufan and Lipka, Nedim and Yang, Diyi and Sun, Tong},
journal={arXiv preprint arXiv:2306.17107},
year={2023}
}
% Disabled duplicate: liu2022convnet is defined again later in this file with a
% complete pages field; two entries under the same key trigger a BibTeX
% "repeated entry" error. Leading @ removed so BibTeX skips this copy.
inproceedings{liu2022convnet,
title={A convnet for the 2020s},
author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
booktitle={CVPR},
year={2022}
}
@article{masry2022chartqa,
title={Chartqa: A benchmark for question answering about charts with visual and logical reasoning},
author={Masry, Ahmed and Long, Do Xuan and Tan, Jia Qing and Joty, Shafiq and Hoque, Enamul},
journal={arXiv preprint arXiv:2203.10244},
year={2022}
}
@inproceedings{mathew2021docvqa,
title={Docvqa: A dataset for vqa on document images},
author={Mathew, Minesh and Karatzas, Dimosthenis and Jawahar, CV},
booktitle={WACV},
year={2021}
}
@inproceedings{kafle2018dvqa,
title={Dvqa: Understanding data visualizations via question answering},
author={Kafle, Kushal and Price, Brian and Cohen, Scott and Kanan, Christopher},
booktitle={CVPR},
year={2018}
}
@inproceedings{acharya2019tallyqa,
title={TallyQA: Answering complex counting questions},
author={Acharya, Manoj and Kafle, Kushal and Kanan, Christopher},
booktitle={AAAI},
year={2019}
}
@inproceedings{johnson2017clevr,
title={Clevr: A diagnostic dataset for compositional language and elementary visual reasoning},
author={Johnson, Justin and Hariharan, Bharath and Van Der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C and Girshick, Ross},
booktitle={CVPR},
year={2017}
}
@article{tu2023many,
title={How many unicorns are in this image? a safety evaluation benchmark for vision llms},
author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han, Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
journal={arXiv preprint arXiv:2311.16101},
year={2023}
}
@inproceedings{gurari2018vizwiz,
title={Vizwiz grand challenge: Answering visual questions from blind people},
author={Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P},
booktitle={CVPR},
year={2018}
}
@article{chen2024allava,
title={ALLaVA: Harnessing GPT4V-synthesized Data for A Lite Vision-Language Model},
author={Chen, Guiming Hardy and Chen, Shunian and Zhang, Ruifei and Chen, Junying and Wu, Xiangbo and Zhang, Zhiyi and Chen, Zhihong and Li, Jianquan and Wan, Xiang and Wang, Benyou},
journal={arXiv preprint arXiv:2402.11684},
year={2024}
}
@article{cha2024visually,
title={Visually Dehallucinative Instruction Generation: Know What You Don't Know},
author={Cha, Sungguk and Lee, Jusung and Lee, Younghyun and Yang, Cheoljong},
journal={arXiv preprint arXiv:2402.09717},
year={2024}
}
@article{si2024design2code,
title={Design2Code: How Far Are We From Automating Front-End Engineering?},
author={Si, Chenglei and Zhang, Yanzhe and Yang, Zhengyuan and Liu, Ruibo and Yang, Diyi},
journal={arXiv preprint arXiv:2403.03163},
year={2024}
}
@article{li2024multimodal,
title={Multimodal ArXiv: A Dataset for Improving Scientific Comprehension of Large Vision-Language Models},
author={Li, Lei and Wang, Yuqi and Xu, Runxin and Wang, Peiyi and Feng, Xiachong and Kong, Lingpeng and Liu, Qi},
journal={arXiv preprint arXiv:2403.00231},
year={2024}
}
@article{wang2024measuring,
title={Measuring Multimodal Mathematical Reasoning with MATH-Vision Dataset},
author={Wang, Ke and Pan, Junting and Shi, Weikang and Lu, Zimu and Zhan, Mingjie and Li, Hongsheng},
journal={arXiv preprint arXiv:2402.14804},
year={2024}
}
@article{wu2023q,
title={Q-instruct: Improving low-level visual abilities for multi-modality foundation models},
author={Wu, Haoning and Zhang, Zicheng and Zhang, Erli and Chen, Chaofeng and Liao, Liang and Wang, Annan and Xu, Kaixin and Li, Chunyi and Hou, Jingwen and Zhai, Guangtao and others},
journal={arXiv preprint arXiv:2311.06783},
year={2023}
}
@inproceedings{kembhavi2016diagram,
title={A diagram is worth a dozen images},
author={Kembhavi, Aniruddha and Salvato, Mike and Kolve, Eric and Seo, Minjoon and Hajishirzi, Hannaneh and Farhadi, Ali},
booktitle={ECCV},
year={2016},
}
@misc{laiongpt4v,
title={laion/gpt4v-dataset},
author={LAION},
year={2023},
url={https://huggingface.co/datasets/laion/gpt4v-dataset}
}
@article{hsiao2022screenqa,
title={Screenqa: Large-scale question-answer pairs over mobile app screenshots},
author={Hsiao, Yu-Chung and Zubach, Fedir and Wang, Maria and others},
journal={arXiv preprint arXiv:2209.08199},
year={2022}
}
@article{lu2022learn,
title={Learn to explain: Multimodal reasoning via thought chains for science question answering},
author={Lu, Pan and Mishra, Swaroop and Xia, Tanglin and Qiu, Liang and Chang, Kai-Wei and Zhu, Song-Chun and Tafjord, Oyvind and Clark, Peter and Kalyan, Ashwin},
journal={NeurIPS},
year={2022}
}
@article{gao2023g,
title={G-llava: Solving geometric problem with multi-modal large language model},
author={Gao, Jiahui and Pi, Renjie and Zhang, Jipeng and Ye, Jiacheng and Zhong, Wanjun and Wang, Yufei and Hong, Lanqing and Han, Jianhua and Xu, Hang and Li, Zhenguo and others},
journal={arXiv preprint arXiv:2312.11370},
year={2023}
}
% arXiv preprint: the previous volume={7}, number={15}, pages={2} were
% auto-export artifacts (meaningless for arXiv:2111.15664) and are removed.
@article{kim2021donut,
title={Donut: Document understanding transformer without ocr},
author={Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
journal={arXiv preprint arXiv:2111.15664},
year={2021}
}
@article{laurenccon2024unlocking,
title={Unlocking the conversion of Web Screenshots into HTML Code with the WebSight Dataset},
author={Lauren{\c{c}}on, Hugo and Tronchon, L{\'e}o and Sanh, Victor},
journal={arXiv preprint arXiv:2403.09029},
year={2024}
}
@article{belouadi2023automatikz,
title={Automatikz: Text-guided synthesis of scientific vector graphics with tikz},
author={Belouadi, Jonas and Lauscher, Anne and Eger, Steffen},
journal={arXiv preprint arXiv:2310.00367},
year={2023}
}
@article{alawwad2024enhancing,
title={Enhancing Textbook Question Answering Task with Large Language Models and Retrieval Augmented Generation},
author={Alawwad, Hessa Abdulrahman and Alhothali, Areej and Naseem, Usman and Alkhathlan, Ali and Jamal, Amani},
journal={arXiv preprint arXiv:2402.05128},
year={2024}
}
@article{lu2021inter,
title={Inter-GPS: Interpretable geometry problem solving with formal language and symbolic reasoning},
author={Lu, Pan and Gong, Ran and Jiang, Shibiao and Qiu, Liang and Huang, Siyuan and Liang, Xiaodan and Zhu, Song-Chun},
journal={arXiv preprint arXiv:2105.04165},
year={2021}
}
@inproceedings{zhang2019raven,
title={Raven: A dataset for relational and analogical visual reasoning},
author={Zhang, Chi and Gao, Feng and Jia, Baoxiong and Zhu, Yixin and Zhu, Song-Chun},
booktitle={CVPR},
year={2019}
}
@article{lu2021iconqa,
title={Iconqa: A new benchmark for abstract diagram understanding and visual language reasoning},
author={Lu, Pan and Qiu, Liang and Chen, Jiaqi and Xia, Tony and Zhao, Yizhou and Zhang, Wei and Yu, Zhou and Liang, Xiaodan and Zhu, Song-Chun},
journal={arXiv preprint arXiv:2110.13214},
year={2021}
}
@article{kazemi2023geomverse,
title={Geomverse: A systematic evaluation of large models for geometric reasoning},
author={Kazemi, Mehran and Alvari, Hamidreza and Anand, Ankit and Wu, Jialin and Chen, Xi and Soricut, Radu},
journal={arXiv preprint arXiv:2312.12241},
year={2023}
}
@article{pasupat2015compositional,
title={Compositional semantic parsing on semi-structured tables},
author={Pasupat, Panupong and Liang, Percy},
journal={arXiv preprint arXiv:1508.00305},
year={2015}
}
@article{zhong2017seq2sql,
title={Seq2sql: Generating structured queries from natural language using reinforcement learning},
author={Zhong, Victor and Xiong, Caiming and Socher, Richard},
journal={arXiv preprint arXiv:1709.00103},
year={2017}
}
@article{chen2021finqa,
title={Finqa: A dataset of numerical reasoning over financial data},
author={Chen, Zhiyu and Chen, Wenhu and Smiley, Charese and Shah, Sameena and Borova, Iana and Langdon, Dylan and Moussa, Reema and Beane, Matt and Huang, Ting-Hao and Routledge, Bryan and others},
journal={arXiv preprint arXiv:2109.00122},
year={2021}
}
@article{cheng2021hitab,
title={HiTab: A hierarchical table dataset for question answering and natural language generation},
author={Cheng, Zhoujun and Dong, Haoyu and Wang, Zhiruo and Jia, Ran and Guo, Jiaqi and Gao, Yan and Han, Shi and Lou, Jian-Guang and Zhang, Dongmei},
journal={arXiv preprint arXiv:2108.06712},
year={2021}
}
@article{zhu2021tat,
title={TAT-QA: A question answering benchmark on a hybrid of tabular and textual content in finance},
author={Zhu, Fengbin and Lei, Wenqiang and Huang, Youcheng and Wang, Chao and Zhang, Shuo and Lv, Jiancheng and Feng, Fuli and Chua, Tat-Seng},
journal={arXiv preprint arXiv:2105.07624},
year={2021}
}
@article{lu2022dynamic,
title={Dynamic prompt learning via policy gradient for semi-structured mathematical reasoning},
author={Lu, Pan and Qiu, Liang and Chang, Kai-Wei and Wu, Ying Nian and Zhu, Song-Chun and Rajpurohit, Tanmay and Clark, Peter and Kalyan, Ashwin},
journal={arXiv preprint arXiv:2209.14610},
year={2022}
}
@article{kantharaj2022chart,
title={Chart-to-text: A large-scale benchmark for chart summarization},
author={Kantharaj, Shankar and Leong, Rixie Tiffany Ko and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq},
journal={arXiv preprint arXiv:2203.06486},
year={2022}
}
@article{tang2023vistext,
title={Vistext: A benchmark for semantically rich chart captioning},
author={Tang, Benny J and Boggust, Angie and Satyanarayan, Arvind},
journal={arXiv preprint arXiv:2307.05356},
year={2023}
}
@inproceedings{biten2022latr,
title={Latr: Layout-aware transformer for scene-text vqa},
author={Biten, Ali Furkan and Litman, Ron and Xie, Yusheng and Appalaraju, Srikar and Manmatha, R},
booktitle={CVPR},
year={2022}
}
@inproceedings{biten2019scene,
title={Scene text visual question answering},
author={Biten, Ali Furkan and Tito, Ruben and Mafla, Andres and Gomez, Lluis and Rusinol, Mar{\c{c}}al and Valveny, Ernest and Jawahar, CV and Karatzas, Dimosthenis},
booktitle={ICCV},
year={2019}
}
@article{kiela2020hateful,
title={The hateful memes challenge: Detecting hate speech in multimodal memes},
author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide},
journal={NeurIPS},
year={2020}
}
@misc{RenderedText,
title={wendlerc/RenderedText},
author={Chris Wendler},
year={2023},
url={https://huggingface.co/datasets/wendlerc/RenderedText}
}
@inproceedings{zhu2016visual7w,
title={Visual7w: Grounded question answering in images},
author={Zhu, Yuke and Groth, Oliver and Bernstein, Michael and Fei-Fei, Li},
booktitle={CVPR},
year={2016}
}
% Conference paper (AAAI): was @article with a booktitle field and no journal,
% which leaves @article's required journal field missing.
@inproceedings{tanaka2021visualmrc,
title={VisualMRC: Machine Reading Comprehension on Document Images},
author={Tanaka, Ryota and Nishida, Kyosuke and Yoshida, Sen},
booktitle={AAAI},
year={2021}
}
% Conference paper (ICLR): was @article with a booktitle field and no journal.
% Key kept as-is (callers cite it), even though it says 2020 and year is 2021.
@inproceedings{shridhar2020alfworld,
title={ALFWorld: Aligning Text and Embodied Environments for Interactive Learning},
author={Shridhar, Mohit and Yuan, Xingdi and C{\^{o}}t{\'{e}}, Marc{-}Alexandre and Bisk, Yonatan and Trischler, Adam and Hausknecht, Matthew J.},
booktitle={ICLR},
year={2021}
}
% Conference paper (ECCV): was @article with a booktitle field and no journal.
@inproceedings{pont-tuset2019localizednarratives,
title={Connecting Vision and Language with Localized Narratives},
author={Pont{-}Tuset, Jordi and Uijlings, Jasper R. R. and Changpinyo, Soravit and Soricut, Radu and Ferrari, Vittorio},
booktitle={ECCV},
year={2020}
}
@article{he2020pathvqa,
title={PathVQA: 30000+ Questions for Medical Visual Question Answering},
author={He, Xuehai and Zhang, Yichen and Mou, Luntian and Xing, Eric P. and Xie, Pengtao},
journal={CoRR},
volume={abs/2003.10286},
year={2020}
}
% Conference paper: was @misc carrying a booktitle field, which standard @misc
% styles ignore (the venue would not be printed).
@inproceedings{liu2023visual,
title={Visual Instruction Tuning},
author={Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee},
booktitle={NeurIPS},
year={2023}
}
@misc{chen2023sharegpt4v,
title={ShareGPT4V: Improving Large Multi-Modal Models with Better Captions},
author={Lin Chen and Jinsong Li and Xiaoyi Dong and Pan Zhang and Conghui He and Jiaqi Wang and Feng Zhao and Dahua Lin},
year={2023},
eprint={2311.12793},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
% Conference paper: was @misc carrying a booktitle field, which standard @misc
% styles ignore (the venue would not be printed).
@inproceedings{hudson2019gqa,
title={{GQA}: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
author={Drew A. Hudson and Christopher D. Manning},
booktitle={CVPR},
year={2019}
}
@misc{marino2019okvqa,
title={OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge},
author={Kenneth Marino and Mohammad Rastegari and Ali Farhadi and Roozbeh Mottaghi},
year={2019},
eprint={1906.00067},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{vishniakov2023convnet,
title={ConvNet vs Transformer, Supervised vs CLIP: Beyond ImageNet Accuracy},
author={Vishniakov, Kirill and Shen, Zhiqiang and Liu, Zhuang},
journal={arXiv preprint arXiv:2311.09215},
year={2023}
}
@misc{schwenk2022aokvqa,
title={A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge},
author={Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi},
year={2022},
eprint={2206.01718},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
% Fixed: field-name typo "authpr" silently dropped the entire author list;
% the venue had been stuffed into archivePrefix (an eprint field) and
% primaryClass was present with no eprint. Now a proper conference entry.
@inproceedings{mishra2019OCR,
title={{OCR-VQA}: Visual Question Answering by Reading Text in Images},
author={Anand Mishra and Shashank Shekhar and Ajeet Kumar Singh and Anirban Chakraborty},
booktitle={International Conference on Document Analysis and Recognition (ICDAR)},
year={2019}
}
@misc{sidorov2020textcaps,
title={TextCaps: a Dataset for Image Captioning with Reading Comprehension},
author={Oleksii Sidorov and Ronghang Hu and Marcus Rohrbach and Amanpreet Singh},
year={2020},
eprint={2003.12462},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{yu2016modeling,
title={Modeling Context in Referring Expressions},
author={Licheng Yu and Patrick Poirson and Shan Yang and Alexander C. Berg and Tamara L. Berg},
year={2016},
eprint={1608.00272},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
% Corporate author: double-braced so BibTeX treats "Chameleon Team" as one
% indivisible name instead of parsing it as Last="Team", First="Chameleon"
% (which would render as "C. Team" in abbreviated styles).
@article{team2024chameleon,
title={Chameleon: Mixed-Modal Early-Fusion Foundation Models},
author={{Chameleon Team}},
journal={arXiv preprint arXiv:2405.09818},
year={2024}
}
@article{yu2023rlhf,
title={Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback},
author={Yu, Tianyu and Yao, Yuan and Zhang, Haoye and He, Taiwen and Han, Yifeng and Cui, Ganqu and Hu, Jinyi and Liu, Zhiyuan and Zheng, Hai-Tao and Sun, Maosong and others},
journal={arXiv preprint arXiv:2312.00849},
year={2023}
}
@article{rafailov2024direct,
title={Direct preference optimization: Your language model is secretly a reward model},
author={Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D and Ermon, Stefano and Finn, Chelsea},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
% Fixed: publisher={November} was a month stuffed into the wrong field
% (auto-export artifact); use the standard unquoted month macro instead.
@misc{zhu2023starling,
title={Starling-7b: Improving llm helpfulness \& harmlessness with rlaif},
author={Zhu, Banghua and Frick, Evan and Wu, Tianhao and Zhu, Hanlin and Jiao, Jiantao},
month = nov,
year={2023}
}
@article{ouyang2022training,
title={Training language models to follow instructions with human feedback},
author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
journal={Advances in neural information processing systems},
volume={35},
pages={27730--27744},
year={2022}
}
@article{dong2024rlhf,
title={Rlhf workflow: From reward modeling to online rlhf},
author={Dong, Hanze and Xiong, Wei and Pang, Bo and Wang, Haoxiang and Zhao, Han and Zhou, Yingbo and Jiang, Nan and Sahoo, Doyen and Xiong, Caiming and Zhang, Tong},
journal={arXiv preprint arXiv:2405.07863},
year={2024}
}
@article{liu2024decade,
title={A Decade's Battle on Dataset Bias: Are We There Yet?},
author={Liu, Zhuang and He, Kaiming},
journal={arXiv preprint arXiv:2403.08632},
year={2024}
}
@inproceedings{yuksekgonul2022and,
title={When and why vision-language models behave like bags-of-words, and what to do about it?},
author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James},
booktitle={The Eleventh International Conference on Learning Representations},
year={2022}
}
@article{tong2024mass,
title={Mass-producing failures of multimodal systems with language models},
author={Tong, Shengbang and Jones, Erik and Steinhardt, Jacob},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
@misc{krishna2016visual,
title={Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
author={Ranjay Krishna and Yuke Zhu and Oliver Groth and Justin Johnson and Kenji Hata and Joshua Kravitz and Stephanie Chen and Yannis Kalantidis and Li-Jia Li and David A. Shamma and Michael S. Bernstein and Fei-Fei Li},
year={2016},
eprint={1602.07332},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{tong2024eyes,
title={Eyes wide shut? exploring the visual shortcomings of multimodal llms},
author={Tong, Shengbang and Liu, Zhuang and Zhai, Yuexiang and Ma, Yi and LeCun, Yann and Xie, Saining},
journal={arXiv preprint arXiv:2401.06209},
year={2024}
}
@article{liu2023improved,
title={Improved baselines with visual instruction tuning},
author={Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
journal={arXiv preprint arXiv:2310.03744},
year={2023}
}
@article{mckinzie2024mm1,
title={Mm1: Methods, analysis \& insights from multimodal llm pre-training},
author={McKinzie, Brandon and Gan, Zhe and Fauconnier, Jean-Philippe and Dodge, Sam and Zhang, Bowen and Dufter, Philipp and Shah, Dhruti and Du, Xianzhi and Peng, Futang and Weers, Floris and others},
journal={arXiv preprint arXiv:2403.09611},
year={2024}
}
@article{xu2023demystifying,
title={Demystifying clip data},
author={Xu, Hu and Xie, Saining and Tan, Xiaoqing Ellen and Huang, Po-Yao and Howes, Russell and Sharma, Vasu and Li, Shang-Wen and Ghosh, Gargi and Zettlemoyer, Luke and Feichtenhofer, Christoph},
journal={arXiv preprint arXiv:2309.16671},
year={2023}
}
@article{fang2023data,
title={Data filtering networks},
author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
journal={arXiv preprint arXiv:2309.17425},
year={2023}
}
@article{gao2024sphinx,
title={SPHINX-X: Scaling Data and Parameters for a Family of Multi-modal Large Language Models},
author={Gao, Peng and Zhang, Renrui and Liu, Chris and Qiu, Longtian and Huang, Siyuan and Lin, Weifeng and Zhao, Shitian and Geng, Shijie and Lin, Ziyi and Jin, Peng and others},
journal={arXiv preprint arXiv:2402.05935},
year={2024}
}
@online{DatabricksBlog2023DollyV2,
author = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},
title = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},
year = {2023},
url = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},
urldate = {2023-06-30}
}
@article{yue2023mammoth,
title={Mammoth: Building math generalist models through hybrid instruction tuning},
author={Yue, Xiang and Qu, Xingwei and Zhang, Ge and Fu, Yao and Huang, Wenhao and Sun, Huan and Su, Yu and Chen, Wenhu},
journal={arXiv preprint arXiv:2309.05653},
year={2023}
}
@article{luo2023wizardcoder,
title={Wizardcoder: Empowering code large language models with evol-instruct},
author={Luo, Ziyang and Xu, Can and Zhao, Pu and Sun, Qingfeng and Geng, Xiubo and Hu, Wenxiang and Tao, Chongyang and Ma, Jing and Lin, Qingwei and Jiang, Daxin},
journal={arXiv preprint arXiv:2306.08568},
year={2023}
}
@misc{mitra2024orcamath,
title={Orca-Math: Unlocking the potential of SLMs in Grade School Math},
author={Arindam Mitra and Hamed Khanpour and Corby Rosset and Ahmed Awadallah},
year={2024},
eprint={2402.14830},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{zheng2024opencodeinterpreter,
title={OpenCodeInterpreter: Integrating Code Generation with Execution and Refinement},
author={Zheng, Tianyu and Zhang, Ge and Shen, Tianhao and Liu, Xueling and Lin, Bill Yuchen and Fu, Jie and Chen, Wenhu and Yue, Xiang},
journal={arXiv preprint arXiv:2402.14658},
year={2024}
}
% Fixed: URL had a doubled scheme ("https://https://"); the pseudonymous
% author "Teknium" is braced as a single-token corporate-style name instead
% of carrying literal quote characters.
@misc{OpenOrca,
  title = {OpenOrca: An Open Dataset of GPT Augmented FLAN Reasoning Traces},
  author = {Wing Lian and Bleys Goodson and Eugene Pentland and Austin Cook and Chanvichet Vong and {Teknium}},
  year = {2023},
  publisher = {HuggingFace},
  journal = {HuggingFace repository},
  howpublished = {\url{https://huggingface.co/Open-Orca/OpenOrca}},
}
@inproceedings{radford2021learning,
title={Learning transferable visual models from natural language supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others},
booktitle={International conference on machine learning},
pages={8748--8763},
year={2021},
organization={PMLR}
}
@article{schuhmann2022laion,
title={Laion-5b: An open large-scale dataset for training next generation image-text models},
author={Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others},
journal={NeurIPS},
volume={35},
pages={25278--25294},
year={2022}
}
@article{zheng2024judging,
title={Judging llm-as-a-judge with mt-bench and chatbot arena},
author={Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
journal={NeurIPS},
volume={36},
year={2024}
}
% Disabled duplicate: fang2023data is already defined earlier in this file
% with identical fields; a second definition triggers a BibTeX
% "repeated entry" error. Leading @ removed so BibTeX skips this copy.
article{fang2023data,
title={Data filtering networks},
author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
journal={arXiv preprint arXiv:2309.17425},
year={2023}
}
@inproceedings{zhai2023sigmoid,
title={Sigmoid loss for language image pre-training},
author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
booktitle={ICCV},
pages={11975--11986},
year={2023}
}
@article{sun2023eva,
title={Eva-clip: Improved training techniques for clip at scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
@inproceedings{liu2022convnet,
title={A convnet for the 2020s},
author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
booktitle={CVPR},
pages={11976--11986},
year={2022}
}
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={CVPR},
pages={2818--2829},
year={2023}
}
@inproceedings{he2022masked,
title={Masked autoencoders are scalable vision learners},
author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
booktitle={CVPR},
pages={16000--16009},
year={2022}
}
@inproceedings{chen2021empirical,
title={An empirical study of training self-supervised vision transformers},
author={Chen, Xinlei and Xie, Saining and He, Kaiming},
booktitle={ICCV},
pages={9640--9649},
year={2021}
}
@article{oquab2023dinov2,
title={Dinov2: Learning robust visual features without supervision},
author={Oquab, Maxime and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and others},
journal={arXiv preprint arXiv:2304.07193},
year={2023}
}
@inproceedings{assran2023self,
title={Self-supervised learning from images with a joint-embedding predictive architecture},
author={Assran, Mahmoud and Duval, Quentin and Misra, Ishan and Bojanowski, Piotr and Vincent, Pascal and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas},
booktitle={CVPR},
year={2023}
}
@article{dosovitskiy2020image,
title={An image is worth 16x16 words: Transformers for image recognition at scale},
author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
journal={arXiv preprint arXiv:2010.11929},
year={2020}
}
@inproceedings{jouppi2023tpu,
title={Tpu v4: An optically reconfigurable supercomputer for machine learning with hardware support for embeddings},
author={Jouppi, Norm and Kurian, George and Li, Sheng and Ma, Peter and Nagarajan, Rahul and Nai, Lifeng and Patil, Nishant and Subramanian, Suvinay and Swing, Andy and Towles, Brian and others},
booktitle={Proceedings of the 50th Annual International Symposium on Computer Architecture},
pages={1--14},
year={2023}
}
@article{zhao2023pytorch,
title={Pytorch fsdp: experiences on scaling fully sharded data parallel},
author={Zhao, Yanli and Gu, Andrew and Varma, Rohan and Luo, Liang and Huang, Chien-Chin and Xu, Min and Wright, Less and Shojanazeri, Hamid and Ott, Myle and Shleifer, Sam and others},
journal={arXiv preprint arXiv:2304.11277},
year={2023}
}
@article{zhou2023don,
title={Don't Make Your LLM an Evaluation Benchmark Cheater},
author={Zhou, Kun and Zhu, Yutao and Chen, Zhipeng and Chen, Wentong and Zhao, Wayne Xin and Chen, Xu and Lin, Yankai and Wen, Ji-Rong and Han, Jiawei},
journal={arXiv preprint arXiv:2311.01964},
year={2023}
}
@inproceedings{kirillov2023segment,
title={Segment anything},
author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C and Lo, Wan-Yen and others},
booktitle={ICCV},
pages={4015--4026},
year={2023}
}
@article{birkl2023midas,
title={Midas v3. 1--a model zoo for robust monocular relative depth estimation},
author={Birkl, Reiner and Wofk, Diana and M{\"u}ller, Matthias},
journal={arXiv preprint arXiv:2307.14460},
year={2023}
}
@article{lasinger2019towards,
title={Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer},
author={Lasinger, Katrin and Ranftl, Ren{\'e} and Schindler, Konrad and Koltun, Vladlen},
journal={arXiv preprint arXiv:1907.01341},
year={2019}
}
% Fixed: booktitle duplicated the acronym ("CVPR (CVPR)"); page range used a
% single hyphen instead of --; month used a quoted word instead of the jun
% macro; the o-umlaut is wrapped as a BibTeX special character {\"o} so
% sorting and label generation treat it as one letter.
@InProceedings{Rombach_2022_CVPR,
    author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj{\"o}rn},
    title     = {High-Resolution Image Synthesis With Latent Diffusion Models},
    booktitle = {CVPR},
    month     = jun,
    year      = {2022},
    pages     = {10684--10695}
}
@article{karamcheti2024prismatic,
  title   = {Prismatic {VLMs}: Investigating the design space of visually-conditioned language models},
  author  = {Karamcheti, Siddharth and Nair, Suraj and Balakrishna, Ashwin and Liang, Percy and Kollar, Thomas and Sadigh, Dorsa},
  journal = {arXiv preprint arXiv:2402.07865},
  year    = {2024}
}
@article{zhai2023investigating,
  author  = {Zhai, Yuexiang and Tong, Shengbang and Li, Xiao and Cai, Mu and Qu, Qing and Lee, Yong Jae and Ma, Yi},
  title   = {Investigating the catastrophic forgetting in multimodal large language models},
  journal = {arXiv preprint arXiv:2309.10313},
  year    = {2023}
}
@misc{liu2024llavanext,
  title  = {{LLaVA-NeXT}: Improved reasoning, {OCR}, and world knowledge},
  author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Li, Bo and Zhang, Yuanhan and Shen, Sheng and Lee, Yong Jae},
  url    = {https://llava-vl.github.io/blog/2024-01-30-llava-next/},
  month  = jan,
  year   = {2024}
}
@article{lu2024deepseek,
  title   = {{DeepSeek-VL}: Towards real-world vision-language understanding},
  author  = {Lu, Haoyu and Liu, Wen and Zhang, Bo and Wang, Bingxuan and Dong, Kai and Liu, Bo and Sun, Jingxiang and Ren, Tongzheng and Li, Zhuoshu and Sun, Yaofeng and others},
  journal = {arXiv preprint arXiv:2403.05525},
  year    = {2024}
}
@inproceedings{li2023your,
  author    = {Li, Alexander C and Prabhudesai, Mihir and Duggal, Shivam and Brown, Ellis and Pathak, Deepak},
  title     = {Your diffusion model is secretly a zero-shot classifier},
  booktitle = {ICCV},
  pages     = {2206--2217},
  year      = {2023}
}
@article{chen2022pali,
  title   = {{PaLI}: A jointly-scaled multilingual language-image model},
  author  = {Chen, Xi and Wang, Xiao and Changpinyo, Soravit and Piergiovanni, AJ and Padlewski, Piotr and Salz, Daniel and Goodman, Sebastian and Grycner, Adam and Mustafa, Basil and Beyer, Lucas and others},
  journal = {arXiv preprint arXiv:2209.06794},
  year    = {2022}
}
@article{murtagh2014ward,
  title     = {Ward's hierarchical agglomerative clustering method: which algorithms implement {Ward's} criterion?},
  author    = {Murtagh, Fionn and Legendre, Pierre},
  journal   = {Journal of classification},
  volume    = {31},
  pages     = {274--295},
  year      = {2014},
  publisher = {Springer}
}
@misc{llama3modelcard,
  title  = {Llama 3 Model Card},
  author = {{AI@Meta}},
  year   = {2024},
  url    = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
}
@misc{Gemini,
  title  = {Gemini},
  author = {{Google}},
  year   = {2023},
  url    = {https://blog.google/technology/ai/google-gemini-ai/}
}
@article{qwen,
  title   = {{Qwen} Technical Report},
  author  = {Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
  journal = {arXiv preprint arXiv:2309.16609},
  year    = {2023}
}
@article{bai2023qwen,
  title   = {{Qwen-VL}: A versatile vision-language model for understanding, localization, text reading, and beyond},
  author  = {Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  journal = {arXiv preprint arXiv:2308.12966},
  year    = {2023}
}
@article{dai2024instructblip,
  title   = {{InstructBLIP}: Towards general-purpose vision-language models with instruction tuning},
  author  = {Dai, Wenliang and Li, Junnan and Li, Dongxu and Tiong, Anthony Meng Huat and Zhao, Junqi and Wang, Weisheng and Li, Boyang and Fung, Pascale N and Hoi, Steven},
  journal = {NeurIPS},
  volume  = {36},
  year    = {2024}
}
@article{liu2023hidden,
  title   = {On the hidden mystery of {OCR} in large multimodal models},
  author  = {Liu, Yuliang and Li, Zhang and Li, Hongliang and Yu, Wenwen and Huang, Mingxin and Peng, Dezhi and Liu, Mingyu and Chen, Mingrui and Li, Chunyuan and Jin, Lianwen and others},
  journal = {arXiv preprint arXiv:2305.07895},
  year    = {2023}
}
@article{ge2023planting,
  author  = {Ge, Yuying and Ge, Yixiao and Zeng, Ziyun and Wang, Xintao and Shan, Ying},
  title   = {Planting a seed of vision in large language model},
  journal = {arXiv preprint arXiv:2307.08041},
  year    = {2023}
}
@inproceedings{wu2023vstar,
  title     = {V*: Guided Visual Search as a Core Mechanism in Multimodal {LLMs}},
  author    = {Wu, Penghao and Xie, Saining},
  booktitle = {CVPR},
  year      = {2024}
}
@inproceedings{jaegle2021perceiver,
  author       = {Jaegle, Andrew and Gimeno, Felix and Brock, Andy and Vinyals, Oriol and Zisserman, Andrew and Carreira, Joao},
  title        = {Perceiver: General perception with iterative attention},
  booktitle    = {International conference on machine learning},
  pages        = {4651--4664},
  year         = {2021},
  organization = {PMLR}
}
@article{young2024yi,
  title   = {Yi: Open foundation models by 01.{AI}},
  author  = {Young, Alex and Chen, Bei and Li, Chao and Huang, Chengen and Zhang, Ge and Zhang, Guanwei and Li, Heng and Zhu, Jiangcheng and Chen, Jianqun and Chang, Jing and others},
  journal = {arXiv preprint arXiv:2403.04652},
  year    = {2024}
}
@article{zhai2024fine,
  author  = {Zhai, Yuexiang and Bai, Hao and Lin, Zipeng and Pan, Jiayi and Tong, Shengbang and Zhou, Yifei and Suhr, Alane and Xie, Saining and LeCun, Yann and Ma, Yi and others},
  title   = {Fine-Tuning Large Vision-Language Models as Decision-Making Agents via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2405.10292},
  year    = {2024}
}
@article{lu2023mathvista,
  title   = {{MathVista}: Evaluating mathematical reasoning of foundation models in visual contexts},
  author  = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi, Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
  journal = {arXiv preprint arXiv:2310.02255},
  year    = {2023}
}
@article{liu2023mmbench,
  title   = {{MMBench}: Is your multi-modal model an all-around player?},
  author  = {Liu, Yuan and Duan, Haodong and Zhang, Yuanhan and Li, Bo and Zhang, Songyang and Zhao, Wangbo and Yuan, Yike and Wang, Jiaqi and He, Conghui and Liu, Ziwei and others},
  journal = {arXiv preprint arXiv:2307.06281},
  year    = {2023}
}
@article{alayrac2022flamingo,
  title   = {Flamingo: a visual language model for few-shot learning},
  author  = {Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katherine and Reynolds, Malcolm and others},
  journal = {NeurIPS},
  year    = {2022}
}
@inproceedings{li2023oxfordtvg,
  title     = {{OxfordTVG-HIC}: Can Machine Make Humorous Captions from Images?},
  author    = {Li, Runjia and Sun, Shuyang and Elhoseiny, Mohamed and Torr, Philip},
  booktitle = {ICCV},
  pages     = {20293--20303},
  year      = {2023}
}
@article{gadre2024datacomp,
  title   = {{DataComp}: In search of the next generation of multimodal datasets},
  author  = {Gadre, Samir Yitzhak and Ilharco, Gabriel and Fang, Alex and Hayase, Jonathan and Smyrnis, Georgios and Nguyen, Thao and Marten, Ryan and Wortsman, Mitchell and Ghosh, Dhruba and Zhang, Jieyu and others},
  journal = {NeurIPS},
  volume  = {36},
  year    = {2024}
}
@article{banani2024probing,
  title   = {Probing the {3D} Awareness of Visual Foundation Models},
  author  = {Banani, Mohamed El and Raj, Amit and Maninis, Kevis-Kokitsi and Kar, Abhishek and Li, Yuanzhen and Rubinstein, Michael and Sun, Deqing and Guibas, Leonidas and Johnson, Justin and Jampani, Varun},
  journal = {arXiv preprint arXiv:2404.08636},
  year    = {2024}
}
@misc{OpenAI2022ChatGPT,
  title  = {{ChatGPT}},
  author = {{OpenAI}},
  year   = {2022},
  url    = {https://openai.com/blog/chatgpt}
}
@misc{Sanseviero2024LLM,
  title         = {{LLM} Evals and Benchmarking},
  author        = {Sanseviero, Omar},
  year          = {2022},
  url           = {https://osanseviero.github.io/hackerllama/},
  internal-note = {NOTE(review): year (2022) conflicts with citation key (2024) -- verify against the blog post date}
}
@article{xu2023demystifying,
  title   = {Demystifying {CLIP} data},
  author  = {Xu, Hu and Xie, Saining and Tan, Xiaoqing Ellen and Huang, Po-Yao and Howes, Russell and Sharma, Vasu and Li, Shang-Wen and Ghosh, Gargi and Zettlemoyer, Luke and Feichtenhofer, Christoph},
  journal = {arXiv preprint arXiv:2309.16671},
  year    = {2023}
}
@article{chen2023sharegpt4v,
  title   = {{ShareGPT4V}: Improving large multi-modal models with better captions},
  author  = {Chen, Lin and Li, Jisong and Dong, Xiaoyi and Zhang, Pan and He, Conghui and Wang, Jiaqi and Zhao, Feng and Lin, Dahua},
  journal = {arXiv preprint arXiv:2311.12793},
  year    = {2023}
}
@misc{grok,
  title  = {{Grok-1.5V}},
  author = {{xAI}},
  year   = {2024},
  url    = {https://x.ai/blog/grok-1.5v}
}
@inproceedings{singh2019towards,
  title     = {Towards {VQA} models that can read},
  author    = {Singh, Amanpreet and Natarajan, Vivek and Shah, Meet and Jiang, Yu and Chen, Xinlei and Batra, Dhruv and Parikh, Devi and Rohrbach, Marcus},
  booktitle = {CVPR},
  pages     = {8317--8326},
  year      = {2019}
}
@article{chang2024survey,
  author    = {Chang, Yupeng and Wang, Xu and Wang, Jindong and Wu, Yuan and Yang, Linyi and Zhu, Kaijie and Chen, Hao and Yi, Xiaoyuan and Wang, Cunxiang and Wang, Yidong and others},
  title     = {A survey on evaluation of large language models},
  journal   = {ACM Transactions on Intelligent Systems and Technology},
  volume    = {15},
  number    = {3},
  pages     = {1--45},
  year      = {2024},
  publisher = {ACM New York, NY}
}
@misc{OpenAI2024gpt4o,
  title  = {{GPT-4o}},
  author = {{OpenAI}},
  year   = {2024},
  url    = {https://openai.com/index/hello-gpt-4o/}
}
@article{touvron2023llama,
  author  = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  title   = {{LLaMA}: Open and efficient foundation language models},
  journal = {arXiv preprint arXiv:2302.13971},
  year    = {2023}
}
@article{touvron2023llama2,
  title   = {{LLaMA} 2: Open foundation and fine-tuned chat models},
  author  = {Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
  journal = {arXiv preprint arXiv:2307.09288},
  year    = {2023}
}
@misc{li2024llavanext-strong,
  title  = {{LLaVA-NeXT}: Stronger {LLMs} Supercharge Multimodal Capabilities in the Wild},
  author = {Li, Bo and Zhang, Kaichen and Zhang, Hao and Guo, Dong and Zhang, Renrui and Li, Feng and Zhang, Yuanhan and Liu, Ziwei and Li, Chunyuan},
  url    = {https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/},
  month  = may,
  year   = {2024}
}
@article{yue2023mmmu,
  title   = {{MMMU}: A massive multi-discipline multimodal understanding and reasoning benchmark for expert {AGI}},
  author  = {Yue, Xiang and Ni, Yuansheng and Zhang, Kai and Zheng, Tianyu and Liu, Ruoqi and Zhang, Ge and Stevens, Samuel and Jiang, Dongfu and Ren, Weiming and Sun, Yuxuan and others},
  journal = {arXiv preprint arXiv:2311.16502},
  year    = {2023}
}
@article{hiippala2021ai2d,
  title     = {{AI2D-RST}: A multimodal corpus of 1000 primary school science diagrams},
  author    = {Hiippala, Tuomo and Alikhani, Malihe and Haverinen, Jonas and Kalliokoski, Timo and Logacheva, Evanfiya and Orekhova, Serafina and Tuomainen, Aino and Stone, Matthew and Bateman, John A},
  journal   = {Language Resources and Evaluation},
  volume    = {55},
  pages     = {661--688},
  year      = {2021},
  publisher = {Springer}
}
@inproceedings{brazil2023omni3d,
  title     = {{Omni3D}: A large benchmark and model for {3D} object detection in the wild},
  author    = {Brazil, Garrick and Kumar, Abhinav and Straub, Julian and Ravi, Nikhila and Johnson, Justin and Gkioxari, Georgia},
  booktitle = {CVPR},
  pages     = {13154--13164},
  year      = {2023}
}
@article{zhou2019semantic,
  title     = {Semantic understanding of scenes through the {ADE20K} dataset},
  author    = {Zhou, Bolei and Zhao, Hang and Puig, Xavier and Xiao, Tete and Fidler, Sanja and Barriuso, Adela and Torralba, Antonio},
  journal   = {IJCV},
  volume    = {127},
  pages     = {302--321},
  year      = {2019},
  publisher = {Springer}
}
@inproceedings{lin2014microsoft,
  title        = {Microsoft {COCO}: Common objects in context},
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle    = {Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}
@article{fu2024blink,
  title   = {{BLINK}: Multimodal Large Language Models Can See but Not Perceive},
  author  = {Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
  journal = {arXiv preprint arXiv:2404.12390},
  year    = {2024}
}
@comment{duplicate entry removed: citation key lu2023mathvista is already defined earlier in this file with identical fields}
@article{russakovsky2015imagenet,
  title     = {{ImageNet} large scale visual recognition challenge},
  author    = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and others},
  journal   = {IJCV},
  volume    = {115},
  pages     = {211--252},
  year      = {2015},
  publisher = {Springer}
}
@book{aquinas,
  title  = {Quaestiones Disputatae de Veritate},
  author = {Aquinas, Thomas},
  year   = {1259},
  note   = {q.2 a.3 arg.19}
}
@book{aristotle-metaphysics-350BCE,
added-at = {2011-06-06T22:32:14.000+0200},
author = {Aristotle},
biburl = {https://www.bibsonomy.org/bibtex/205aed2bf4d0b39ab66d998142b5608cd/mhwombat},
editor = {by W. D. Ross, Translated},
groups = {public},
interhash = {9a4a19db0a4c4ac757b2e4fb7dddadeb},
intrahash = {05aed2bf4d0b39ab66d998142b5608cd},
keywords = {MSc _checked philosophy},
publisher = {The Internet Classics Archive},