@comment{GitHub page chrome and line-number gutter removed from scraped copy; bibliography entries follow.}
@article{abdin2024phi3technicalreporthighly,
 author = {Abdin, Marah and Jacobs, Sam Ade and Awan, Ammar Ahmad and Aneja, Jyoti and Awadallah, Ahmed and Awadalla, Hany and Bach, Nguyen and Bahree, Amit and Bakhtiari, Arash and Behl, Harkirat and others},
 journal = {ArXiv preprint},
 title = {{Phi-3} technical report: A highly capable language model locally on your phone},
 url = {https://arxiv.org/abs/2404.14219},
 volume = {abs/2404.14219},
 year = {2024}
}
@inproceedings{ahia-etal-2023-languages,
abstract = {Language models have graduated from being research prototypes to commercialized products offered as web APIs, and recent works have highlighted the multilingual capabilities of these products. The API vendors charge their users based on usage, more specifically on the number of {``}tokens{''} processed or generated by the underlying language models. What constitutes a token, however, is training data and model dependent with a large variance in the number of tokens required to convey the same information in different languages. In this work, we analyze the effect of this non-uniformity on the fairness of an API{'}s pricing policy across languages. We conduct a systematic analysis of the cost and utility of OpenAI{'}s language model API on multilingual benchmarks in 22 typologically diverse languages. We show evidence that speakers of a large number of the supported languages are overcharged while obtaining poorer results. These speakers tend to also come from regions where the APIs are less affordable, to begin with. Through these analyses, we aim to increase transparency around language model APIs{'} pricing policies and encourage the vendors to make them more equitable.},
author = {Ahia, Orevaoghene and
Kumar, Sachin and
Gonen, Hila and
Kasai, Jungo and
Mortensen, David and
Smith, Noah and
Tsvetkov, Yulia},
booktitle = {Proc. of EMNLP},
editor = {Bouamor, Houda and
Pino, Juan and
Bali, Kalika},
pages = {9904--9923},
title = {Do All Languages Cost the Same? Tokenization in the Era of Commercial Language Models},
year = {2023}
}
@inproceedings{alkhamissi-etal-2024-investigating,
abstract = {The intricate relationship between language and culture has long been a subject of exploration within the realm of linguistic anthropology. Large Language Models (LLMs), promoted as repositories of collective human knowledge, raise a pivotal question: do these models genuinely encapsulate the diverse knowledge adopted by different cultures? Our study reveals that these models demonstrate greater cultural alignment along two dimensions{---}firstly, when prompted with the dominant language of a specific culture, and secondly, when pretrained with a refined mixture of languages employed by that culture. We quantify cultural alignment by simulating sociological surveys, comparing model responses to those of actual survey participants as references. Specifically, we replicate a survey conducted in various regions of Egypt and the United States through prompting LLMs with different pretraining data mixtures in both Arabic and English with the personas of the real respondents and the survey questions. Further analysis reveals that misalignment becomes more pronounced for underrepresented personas and for culturally sensitive topics, such as those probing social values. Finally, we introduce Anthropological Prompting, a novel method leveraging anthropological reasoning to enhance cultural alignment. Our study emphasizes the necessity for a more balanced multilingual pretraining dataset to better represent the diversity of human experience and the plurality of different cultures with many implications on the topic of cross-lingual transfer.},
author = {AlKhamissi, Badr and
ElNokrashy, Muhammad and
Alkhamissi, Mai and
Diab, Mona},
booktitle = {Proc. of ACL},
editor = {Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek},
pages = {12404--12422},
title = {Investigating Cultural Alignment of Large Language Models},
year = {2024}
}
@incollection{Bengio+chapter2007,
author = {Bengio, Yoshua and LeCun, Yann},
booktitle = {Large Scale Kernel Machines},
title = {Scaling Learning Algorithms Towards {AI}},
year = {2007}
}
@article{beyer2024paligemma,
author = {Lucas Beyer and Andreas Steiner and André Susano Pinto and Alexander Kolesnikov and Xiao Wang and Daniel Salz and Maxim Neumann and Ibrahim Alabdulmohsin and Michael Tschannen and Emanuele Bugliarello and Thomas Unterthiner and Daniel Keysers and Skanda Koppula and Fangyu Liu and Adam Grycner and Alexey Gritsenko and Neil Houlsby and Manoj Kumar and Keran Rong and Julian Eisenschlos and Rishabh Kabra and Matthias Bauer and Matko Bošnjak and Xi Chen and Matthias Minderer and Paul Voigtlaender and Ioana Bica and Ivana Balazevic and Joan Puigcerver and Pinelopi Papalampidi and Olivier Henaff and Xi Xiong and Radu Soricut and Jeremiah Harmsen and Xiaohua Zhai},
journal = {ArXiv preprint},
title = {{PaliGemma: A versatile 3B VLM for transfer}},
url = {https://arxiv.org/abs/2407.07726},
volume = {abs/2407.07726},
year = {2024}
}
@article{biderman2024lessons,
author = {Biderman, Stella and Schoelkopf, Hailey and Sutawika, Lintang and Gao, Leo and Tow, Jonathan and Abbasi, Baber and Aji, Alham Fikri and Ammanamanchi, Pawan Sasanka and Black, Sidney and Clive, Jordan and others},
journal = {ArXiv preprint},
title = {Lessons from the Trenches on Reproducible Evaluation of Language Models},
url = {https://arxiv.org/abs/2405.14782},
volume = {abs/2405.14782},
year = {2024}
}
@inproceedings{blasi-etal-2022-systematic,
address = {Dublin, Ireland},
author = {Blasi, Damian and
Anastasopoulos, Antonios and
Neubig, Graham},
booktitle = {Proc. of ACL},
pages = {5486--5505},
publisher = {Association for Computational Linguistics},
title = {Systematic Inequalities in Language Technology Performance across the World{'}s Languages},
url = {https://aclanthology.org/2022.acl-long.376},
year = {2022}
}
@article{changpinyo2022maxm,
author = {Changpinyo, Soravit and Xue, Linting and Yarom, Michal and Thapliyal, Ashish V and Szpektor, Idan and Amelot, Julien and Chen, Xi and Soricut, Radu},
journal = {ArXiv preprint},
title = {MaXM: Towards multilingual visual question answering},
url = {https://arxiv.org/abs/2209.05401},
volume = {abs/2209.05401},
year = {2022}
}
@article{chen2023pali,
 author = {Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and others},
 journal = {ArXiv preprint},
 title = {{PaLI-X}: On scaling up a multilingual vision and language model},
 url = {https://arxiv.org/abs/2305.18565},
 volume = {abs/2305.18565},
 year = {2023}
}
@article{chen2024allava,
author = {Chen, Guiming Hardy and Chen, Shunian and Zhang, Ruifei and Chen, Junying and Wu, Xiangbo and Zhang, Zhiyi and Chen, Zhihong and Li, Jianquan and Wan, Xiang and Wang, Benyou},
journal = {ArXiv preprint},
title = {Allava: Harnessing gpt4v-synthesized data for a lite vision-language model},
url = {https://arxiv.org/abs/2402.11684},
volume = {abs/2402.11684},
year = {2024}
}
@misc{ChineseLLaVA,
author = {LinkSoul-AI},
note = {Accessed: 2024-10-01},
title = {Chinese-LLaVA},
year = {2023}
}
@misc{ChineseLLaVA_Med,
author = {BUAA},
note = {Accessed: 2024-10-01},
title = {Chinese-LLaVA-Med},
year = {2023}
}
@article{clark2020tydi,
address = {Cambridge, MA},
author = {Clark, Jonathan H. and
Choi, Eunsol and
Collins, Michael and
Garrette, Dan and
Kwiatkowski, Tom and
Nikolaev, Vitaly and
Palomaki, Jennimaria},
journal = {Transactions of the Association for Computational Linguistics},
pages = {454--470},
publisher = {MIT Press},
title = {{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages},
url = {https://aclanthology.org/2020.tacl-1.30},
volume = {8},
year = {2020}
}
@article{deepmind_gemini_report,
author = {Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Wu, Yonghui and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and others},
journal = {ArXiv preprint},
title = {Gemini: a family of highly capable multimodal models},
url = {https://arxiv.org/abs/2312.11805},
volume = {abs/2312.11805},
year = {2023}
}
@misc{deepvk2024gqa_ru,
author = {Belopolskih, Daniil and Spirin, Egor},
title = {GQA-ru},
year = {2024}
}
@article{deitke2024molmo,
author = {Deitke, Matt and Clark, Christopher and Lee, Sangho and Tripathi, Rohun and Yang, Yue and Park, Jae Sung and Salehi, Mohammadreza and Muennighoff, Niklas and Lo, Kyle and Soldaini, Luca and others},
journal = {ArXiv preprint},
title = {Molmo and PixMo: Open Weights and Open Data for State-of-the-Art Multimodal Models},
url = {https://arxiv.org/abs/2409.17146},
volume = {abs/2409.17146},
year = {2024}
}
@article{doan2024vintern,
author = {Doan, Khang T and Huynh, Bao G and Hoang, Dung T and Pham, Thuc D and Pham, Nhat H and Nguyen, Quan and Vo, Bang Q and Hoang, Suong N},
journal = {ArXiv preprint},
title = {Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese},
url = {https://arxiv.org/abs/2408.12480},
volume = {abs/2408.12480},
year = {2024}
}
@article{dubey2024llama,
author = {Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
journal = {ArXiv preprint},
title = {The llama 3 herd of models},
url = {https://arxiv.org/abs/2407.21783},
volume = {abs/2407.21783},
year = {2024}
}
@inproceedings{geigle_etal_2024_mblip,
author = {Geigle, Gregor and
Jain, Abhay and
Timofte, Radu and
Glava{\v{s}}, Goran},
booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
editor = {Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William},
pages = {7--25},
title = {m{BLIP}: Efficient Bootstrapping of Multilingual Vision-{LLM}s},
year = {2024}
}
@inproceedings{goodfellow2016deep,
 author = {Ruslan Salakhutdinov},
 biburl = {https://dblp.org/rec/conf/kdd/Salakhutdinov14.bib},
 booktitle = {Proc. of KDD},
 editor = {Sofus A. Macskassy and
Claudia Perlich and
Jure Leskovec and
Wei Wang and
Rayid Ghani},
 internal-note = {NOTE(review): citation key suggests Goodfellow/Bengio/Courville 2016 "Deep Learning" (book), but the entry data is Salakhutdinov's 2014 KDD tutorial -- verify which work the citing text intends},
 pages = {1973},
 publisher = {{ACM}},
 timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
 title = {Deep learning},
 url = {https://doi.org/10.1145/2623330.2630809},
 year = {2014}
}
@misc{gpt4o,
 author = {OpenAI},
 key = {gpt-4o},
 title = {Hello {GPT-4o}},
 url = {https://openai.com/index/hello-gpt-4o/},
 year = {2024}
}
@inproceedings{gupta2019lvis,
author = {Agrim Gupta and
Piotr Doll{\'{a}}r and
Ross B. Girshick},
biburl = {https://dblp.org/rec/conf/cvpr/GuptaDG19.bib},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2019, Long Beach, CA, USA, June 16-20, 2019},
pages = {5356--5364},
publisher = {Computer Vision Foundation / {IEEE}},
timestamp = {Mon, 20 Jan 2020 00:00:00 +0100},
title = {{LVIS:} {A} Dataset for Large Vocabulary Instance Segmentation},
url = {http://openaccess.thecvf.com/content\_CVPR\_2019/html/Gupta\_LVIS\_A\_Dataset\_for\_Large\_Vocabulary\_Instance\_Segmentation\_CVPR\_2019\_paper.html},
year = {2019}
}
@inproceedings{Hamborg2017,
author = {Hamborg, Felix and Meuschke, Norman and Breitinger, Corinna and Gipp, Bela},
booktitle = {Proceedings of the 15th International Symposium of Information Science},
location = {Berlin},
pages = {218--223},
title = {news-please: A Generic News Crawler and Extractor},
year = {2017}
}
@article{han2023reading,
author = {Han, Seungju and Kim, Junhyeok and Hessel, Jack and Jiang, Liwei and Chung, Jiwan and Son, Yejin and Choi, Yejin and Yu, Youngjae},
journal = {ArXiv preprint},
title = {Reading Books is Great, But Not if You Are Driving! Visually Grounded Reasoning about Defeasible Commonsense Norms},
url = {https://arxiv.org/abs/2310.10418},
volume = {abs/2310.10418},
year = {2023}
}
@inproceedings{hendrycksmeasuring2021,
author = {Dan Hendrycks and
Collin Burns and
Steven Basart and
Andy Zou and
Mantas Mazeika and
Dawn Song and
Jacob Steinhardt},
biburl = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
booktitle = {Proc. of ICLR},
publisher = {OpenReview.net},
timestamp = {Wed, 23 Jun 2021 01:00:00 +0200},
title = {Measuring Massive Multitask Language Understanding},
url = {https://openreview.net/forum?id=d7KBjmI3GmQ},
year = {2021}
}
@inproceedings{hessel2023androids,
 author = {Hessel, Jack and Marasovi{\'c}, Ana and Hwang, Jena D and Lee, Lillian and Da, Jeff and Zellers, Rowan and Mankoff, Robert and Choi, Yejin},
 booktitle = {Proc. of ACL},
 pages = {688--714},
 title = {Do Androids Laugh at Electric Sheep? Humor {``}Understanding{''} Benchmarks from The New Yorker Caption Contest},
 year = {2023}
}
@article{Hinton06,
author = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee Whye},
journal = {Neural Computation},
pages = {1527--1554},
title = {A Fast Learning Algorithm for Deep Belief Nets},
volume = {18},
year = {2006}
}
@article{hu2024minicpm,
author = {Hu, Shengding and Tu, Yuge and Han, Xu and He, Chaoqun and Cui, Ganqu and Long, Xiang and Zheng, Zhi and Fang, Yewei and Huang, Yuxiang and Zhao, Weilin and others},
journal = {ArXiv preprint},
title = {Minicpm: Unveiling the potential of small language models with scalable training strategies},
url = {https://arxiv.org/abs/2404.06395},
volume = {abs/2404.06395},
year = {2024}
}
@article{jain2021mural,
author = {Jain, Aashi and Guo, Mandy and Srinivasan, Krishna and Chen, Ting and Kudugunta, Sneha and Jia, Chao and Yang, Yinfei and Baldridge, Jason},
journal = {ArXiv preprint},
title = {Mural: multimodal, multitask retrieval across languages},
url = {https://arxiv.org/abs/2109.05125},
volume = {abs/2109.05125},
year = {2021}
}
@inproceedings{joshi-etal-2020-state,
address = {Online},
author = {Joshi, Pratik and
Santy, Sebastin and
Budhiraja, Amar and
Bali, Kalika and
Choudhury, Monojit},
booktitle = {Proc. of ACL},
pages = {6282--6293},
publisher = {Association for Computational Linguistics},
title = {The State and Fate of Linguistic Diversity and Inclusion in the {NLP} World},
url = {https://aclanthology.org/2020.acl-main.560},
year = {2020}
}
@inproceedings{kim2022donut,
author = {Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Nam, JeongYeon and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
booktitle = {European Conference on Computer Vision (ECCV)},
title = {OCR-Free Document Understanding Transformer},
year = {2022}
}
@inproceedings{kim2023prometheus,
author = {Kim, Seungone and Shin, Jamin and Cho, Yejin and Jang, Joel and Longpre, Shayne and Lee, Hwaran and Yun, Sangdoo and Shin, Seongjin and Kim, Sungdong and Thorne, James and others},
booktitle = {The Twelfth International Conference on Learning Representations},
title = {Prometheus: Inducing fine-grained evaluation capability in language models},
year = {2023}
}
@article{kim2024biggen,
author = {Kim, Seungone and Suk, Juyoung and Cho, Ji Yong and Longpre, Shayne and Kim, Chaeeun and Yoon, Dongkeun and Son, Guijin and Cho, Yejin and Shafayat, Sheikh and Baek, Jinheon and others},
journal = {ArXiv preprint},
title = {The BiGGen Bench: A Principled Benchmark for Fine-grained Evaluation of Language Models with Language Models},
url = {https://arxiv.org/abs/2406.05761},
volume = {abs/2406.05761},
year = {2024}
}
@article{kim2024prometheus,
author = {Kim, Seungone and Suk, Juyoung and Longpre, Shayne and Lin, Bill Yuchen and Shin, Jamin and Welleck, Sean and Neubig, Graham and Lee, Moontae and Lee, Kyungjae and Seo, Minjoon},
journal = {ArXiv preprint},
title = {Prometheus 2: An open source language model specialized in evaluating other language models},
url = {https://arxiv.org/abs/2405.01535},
volume = {abs/2405.01535},
year = {2024}
}
@article{krishna2017visual,
author = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A and others},
journal = {International journal of computer vision},
pages = {32--73},
title = {Visual genome: Connecting language and vision using crowdsourced dense image annotations},
volume = {123},
year = {2017}
}
@inproceedings{lai2023okapi,
author = {Lai, Viet and Nguyen, Chien and Ngo, Nghia and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan and Nguyen, Thien},
booktitle = {Proc. of EMNLP},
pages = {318--327},
title = {Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
year = {2023}
}
@article{lee2024prometheusvision,
author = {Lee, Seongyun and Kim, Seungone and Park, Sue Hyun and Kim, Geewook and Seo, Minjoon},
journal = {ArXiv preprint},
title = {Prometheusvision: Vision-language model as a judge for fine-grained evaluation},
url = {https://arxiv.org/abs/2401.06591},
volume = {abs/2401.06591},
year = {2024}
}
@article{lin2022fewshotlearningmultilinguallanguage,
 author = {Xi Victoria Lin and Todor Mihaylov and Mikel Artetxe and Tianlu Wang and Shuohui Chen and Daniel Simig and Myle Ott and Naman Goyal and Shruti Bhosale and Jingfei Du and Ramakanth Pasunuru and Sam Shleifer and Punit Singh Koura and Vishrav Chaudhary and Brian O'Horo and Jeff Wang and Luke Zettlemoyer and Zornitsa Kozareva and Mona Diab and Veselin Stoyanov and Xian Li},
 journal = {ArXiv preprint},
 title = {Few-shot Learning with Multilingual Language Models},
 url = {https://arxiv.org/abs/2112.10668},
 volume = {abs/2112.10668},
 year = {2021}
}
@inproceedings{liu2021visually,
address = {Online and Punta Cana, Dominican Republic},
author = {Liu, Fangyu and
Bugliarello, Emanuele and
Ponti, Edoardo Maria and
Reddy, Siva and
Collier, Nigel and
Elliott, Desmond},
booktitle = {Proc. of EMNLP},
pages = {10467--10485},
publisher = {Association for Computational Linguistics},
title = {Visually Grounded Reasoning across Languages and Cultures},
url = {https://aclanthology.org/2021.emnlp-main.818},
year = {2021}
}
@article{liu2023improvedllava,
 author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
 journal = {ArXiv preprint},
 title = {Improved Baselines with Visual Instruction Tuning},
 url = {https://arxiv.org/abs/2310.03744},
 volume = {abs/2310.03744},
 year = {2023}
}
@inproceedings{liu2023llava,
author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
booktitle = {NeurIPS},
title = {Visual Instruction Tuning},
year = {2023}
}
@misc{liu2024llavanext,
author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Li, Bo and Zhang, Yuanhan and Shen, Sheng and Lee, Yong Jae},
title = {LLaVA-NeXT: Improved reasoning, OCR, and world knowledge},
year = {2024}
}
@misc{LLaVA_JP_Instruct_108K,
author = {Toshi456},
note = {Accessed: 2024-10-01},
title = {LLaVA-JP-Instruct-108K Dataset},
year = {2023}
}
@misc{lmms_eval2024,
 author = {Li, Bo and Zhang, Peiyuan and Zhang, Kaichen and Pu, Fanyi and Du, Xinrun and Dong, Yuhao and Liu, Haotian and Zhang, Yuanhan and Zhang, Ge and Li, Chunyuan and Liu, Ziwei},
 title = {{LMMs-Eval}: Accelerating the Development of Large Multimodal Models},
 version = {v0.1.0},
 year = {2024}
}
@inproceedings{lumathvista,
author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi, Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
booktitle = {The Twelfth International Conference on Learning Representations},
title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
year = {2024}
}
@inproceedings{masry2022chartqa,
address = {Dublin, Ireland},
author = {Masry, Ahmed and
Do, Xuan Long and
Tan, Jia Qing and
Joty, Shafiq and
Hoque, Enamul},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {2263--2279},
publisher = {Association for Computational Linguistics},
title = {{C}hart{QA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning},
url = {https://aclanthology.org/2022.findings-acl.177},
year = {2022}
}
@misc{MMMLU,
author = {OpenAI},
note = {Accessed: 2024-10-01},
title = {MMMLU Dataset},
year = {2024}
}
@inproceedings{NEURIPS2023_d08b6801,
author = {Ramaswamy, Vikram V. and Lin, Sing Yu and Zhao, Dora and Adcock, Aaron and van der Maaten, Laurens and Ghadiyaram, Deepti and Russakovsky, Olga},
booktitle = {Advances in Neural Information Processing Systems},
editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
pages = {66127--66137},
title = {GeoDE: a Geographically Diverse Evaluation Dataset for Object Recognition},
volume = {36},
year = {2023}
}
@inproceedings{ni2021m3p,
author = {Minheng Ni and
Haoyang Huang and
Lin Su and
Edward Cui and
Taroon Bharti and
Lijuan Wang and
Dongdong Zhang and
Nan Duan},
biburl = {https://dblp.org/rec/conf/cvpr/NiHSCBW0D21.bib},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2021, virtual, June 19-25, 2021},
pages = {3977--3986},
publisher = {Computer Vision Foundation / {IEEE}},
timestamp = {Mon, 18 Jul 2022 01:00:00 +0200},
title = {{M3P:} Learning Universal Representations via Multitask Multilingual
Multimodal Pre-Training},
url = {https://openaccess.thecvf.com/content/CVPR2021/html/Ni\_M3P\_Learning\_Universal\_Representations\_via\_Multitask\_Multilingual\_Multimodal\_Pre-Training\_CVPR\_2021\_paper.html},
year = {2021}
}
@article{nllb2024scaling,
author = {{NLLB Team}},
journal = {Nature},
number = {8018},
pages = {841},
title = {Scaling neural machine translation to 200 languages},
volume = {630},
year = {2024}
}
@misc{OpenHermes25,
author = {Teknium},
title = {OpenHermes 2.5: An Open Dataset of Synthetic Data for Generalist LLM Assistants},
year = {2023}
}
@inproceedings{PALO,
author = {Rasheed, Hanoona and Maaz, Muhammad and Shaker, Abdelrahman and Khan, Salman and Cholakal, Hisham and Anwer, Rao M. and Baldwin, Tim and Felsberg, Michael and Khan, Fahad S.},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025)},
title = {Palo: A Large Multilingual Multimodal Language Model},
year = {2025}
}
@inproceedings{pfeiffer2021xgqa,
 address = {Dublin, Ireland},
 author = {Pfeiffer, Jonas and
Geigle, Gregor and
Kamath, Aishwarya and
Steitz, Jan-Martin and
Roth, Stefan and
Vuli{\'c}, Ivan and
Gurevych, Iryna},
 booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
 internal-note = {NOTE(review): exact duplicate of entry pfeiffer2022xgqa -- consolidate all citations to a single key, then remove the other entry},
 pages = {2497--2511},
 publisher = {Association for Computational Linguistics},
 title = {x{GQA}: Cross-Lingual Visual Question Answering},
 url = {https://aclanthology.org/2022.findings-acl.196},
 year = {2022}
}
@inproceedings{pfeiffer2022xgqa,
address = {Dublin, Ireland},
author = {Pfeiffer, Jonas and
Geigle, Gregor and
Kamath, Aishwarya and
Steitz, Jan-Martin and
Roth, Stefan and
Vuli{\'c}, Ivan and
Gurevych, Iryna},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {2497--2511},
publisher = {Association for Computational Linguistics},
title = {x{GQA}: Cross-Lingual Visual Question Answering},
url = {https://aclanthology.org/2022.findings-acl.196},
year = {2022}
}
@article{pfeiffer2023mmt5,
author = {Pfeiffer, Jonas and Piccinno, Francesco and Nicosia, Massimo and Wang, Xinyi and Reid, Machel and Ruder, Sebastian},
journal = {ArXiv preprint},
title = {mmt5: Modular multilingual pre-training solves source language hallucinations},
url = {https://arxiv.org/abs/2305.14224},
volume = {abs/2305.14224},
year = {2023}
}
@article{ramaswamy2024geode,
 author = {Ramaswamy, Vikram V and Lin, Sing Yu and Zhao, Dora and Adcock, Aaron and van der Maaten, Laurens and Ghadiyaram, Deepti and Russakovsky, Olga},
 internal-note = {NOTE(review): same work as entry NEURIPS2023_d08b6801 (GeoDE, NeurIPS 2023) under a different key, type, and year -- consolidate citations to one entry},
 journal = {Advances in Neural Information Processing Systems},
 title = {{GeoDE}: a geographically diverse evaluation dataset for object recognition},
 volume = {36},
 year = {2024}
}
@inproceedings{reimers2019sentence,
address = {Hong Kong, China},
author = {Reimers, Nils and
Gurevych, Iryna},
booktitle = {Proc. of EMNLP},
pages = {3982--3992},
publisher = {Association for Computational Linguistics},
title = {Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks},
url = {https://aclanthology.org/D19-1410},
year = {2019}
}
@article{romero2024cvqa,
author = {Romero, David and Lyu, Chenyang and Wibowo, Haryo Akbarianto and Lynn, Teresa and Hamed, Injy and Kishore, Aditya Nanda and Mandal, Aishik and Dragonetti, Alina and Abzaliev, Artem and Tonja, Atnafu Lambebo and others},
journal = {ArXiv preprint},
title = {CVQA: Culturally-diverse Multilingual Visual Question Answering Benchmark},
url = {https://arxiv.org/abs/2406.05967},
volume = {abs/2406.05967},
year = {2024}
}
@article{schuhmann2022laion,
author = {Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others},
journal = {Advances in Neural Information Processing Systems},
pages = {25278--25294},
title = {Laion-5b: An open large-scale dataset for training next generation image-text models},
volume = {35},
year = {2022}
}
@article{shan2022ernie,
author = {Shan, Bin and Han, Yaqian and Yin, Weichong and Wang, Shuohuan and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
journal = {ArXiv preprint},
title = {Ernie-unix2: A unified cross-lingual cross-modal framework for understanding and generation},
url = {https://arxiv.org/abs/2211.04861},
volume = {abs/2211.04861},
year = {2022}
}
@article{shi2022languagemodelsmultilinguallanguage,
 author = {Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
 internal-note = {NOTE(review): same work as entry shi2022mgsm (ICLR 2023 version) -- consolidate citations to one key},
 journal = {ArXiv preprint},
 title = {Language Models are Multilingual Chain-of-Thought Reasoners},
 url = {https://arxiv.org/abs/2210.03057},
 volume = {abs/2210.03057},
 year = {2022}
}
@inproceedings{shi2022mgsm,
author = {Shi, Freda and Suzgun, Mirac and Freitag, Markus and Wang, Xuezhi and Srivats, Suraj and Vosoughi, Soroush and Chung, Hyung Won and Tay, Yi and Ruder, Sebastian and Zhou, Denny and others},
booktitle = {The Eleventh International Conference on Learning Representations},
title = {Language models are multilingual chain-of-thought reasoners},
year = {2022}
}
@inproceedings{song-etal-2023-globalbench,
abstract = {Despite the major advances in NLP, significant disparities in NLP system performance across languages still exist. Arguably, these are due to uneven resource allocation and sub-optimal incentives to work on less resourced languages. To track and further incentivize the global development of equitable language technology, we introduce GlobalBench. Prior multilingual benchmarks are static and have focused on a limited number of tasks and languages. In contrast, GlobalBench is an ever-expanding collection that aims to dynamically track progress on all NLP datasets in all languages. Rather than solely measuring accuracy, GlobalBench also tracks the estimated per-speaker utility and equity of technology across all languages, providing a multi-faceted view of how language technology is serving people of the world. Furthermore, GlobalBench is designed to identify the most under-served languages, and rewards research efforts directed towards those languages. At present, the most under-served languages are the ones with a relatively high population, but nonetheless overlooked by composite multilingual benchmarks (like Punjabi, Portuguese, and Wu Chinese). Currently, GlobalBench covers 966 datasets in 190 languages, and has 1,128 system submissions spanning 62 languages.},
address = {Singapore},
author = {Song, Yueqi and
Khanuja, Simran and
Liu, Pengfei and
Faisal, Fahim and
Ostapenko, Alissa and
Winata, Genta and
Aji, Alham Fikri and
Cahyawijaya, Samuel and
Tsvetkov, Yulia and
Anastasopoulos, Antonios and
Neubig, Graham},
booktitle = {Proc. of EMNLP},
editor = {Bouamor, Houda and
Pino, Juan and
Bali, Kalika},
pages = {14157--14171},
publisher = {Association for Computational Linguistics},
title = {{G}lobal{B}ench: A Benchmark for Global Progress in Natural Language Processing},
url = {https://aclanthology.org/2023.emnlp-main.875},
year = {2023}
}
@article{tang2024mtvqa,
author = {Tang, Jingqun and Liu, Qi and Ye, Yongjie and Lu, Jinghui and Wei, Shu and Lin, Chunhui and Li, Wanqing and Mahmood, Mohamad Fitri Faiz Bin and Feng, Hao and Zhao, Zhen and others},
journal = {ArXiv preprint},
title = {{MTVQA}: Benchmarking Multilingual Text-Centric Visual Question Answering},
url = {https://arxiv.org/abs/2405.11985},
volume = {abs/2405.11985},
year = {2024}
}
@inproceedings{thapliyal2022crossmodal,
address = {Abu Dhabi, United Arab Emirates},
author = {Thapliyal, Ashish V. and
Pont-Tuset, Jordi and
Chen, Xi and
Soricut, Radu},
booktitle = {Proc. of EMNLP},
pages = {715--729},
publisher = {Association for Computational Linguistics},
title = {{Crossmodal-3600}: A Massively Multilingual Multimodal Evaluation Dataset},
url = {https://aclanthology.org/2022.emnlp-main.45},
year = {2022}
}
@article{tong2024cambrian,
author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
journal = {ArXiv preprint},
title = {{Cambrian-1}: A Fully Open, Vision-Centric Exploration of Multimodal {LLM}s},
url = {https://arxiv.org/abs/2406.16860},
volume = {abs/2406.16860},
year = {2024}
}
@online{wikidump,
author = {{Wikimedia Foundation}},
title = {Wikimedia Downloads},
url = {https://dumps.wikimedia.org}
}
@inproceedings{xue2020mt5,
address = {Online},
author = {Xue, Linting and
Constant, Noah and
Roberts, Adam and
Kale, Mihir and
Al-Rfou, Rami and
Siddhant, Aditya and
Barua, Aditya and
Raffel, Colin},
booktitle = {Proc. of NAACL},
pages = {483--498},
publisher = {Association for Computational Linguistics},
title = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
url = {https://aclanthology.org/2021.naacl-main.41},
year = {2021}
}
@article{yang2024qwen2,
author = {Yang, An and Yang, Baosong and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Zhou, Chang and Li, Chengpeng and Li, Chengyuan and Liu, Dayiheng and Huang, Fei and others},
journal = {ArXiv preprint},
title = {{Qwen2} Technical Report},
url = {https://arxiv.org/abs/2407.10671},
volume = {abs/2407.10671},
year = {2024}
}
@article{ye2023flask,
author = {Ye, Seonghyeon and Kim, Doyoung and Kim, Sungdong and Hwang, Hyeonbin and Kim, Seungone and Jo, Yongrae and Thorne, James and Kim, Juho and Seo, Minjoon},
journal = {ArXiv preprint},
title = {{FLASK}: Fine-Grained Language Model Evaluation Based on Alignment Skill Sets},
url = {https://arxiv.org/abs/2307.10928},
volume = {abs/2307.10928},
year = {2023}
}
@inproceedings{yoshikawa2017stair,
address = {Vancouver, Canada},
author = {Yoshikawa, Yuya and
Shigeto, Yutaro and
Takeuchi, Akikazu},
booktitle = {Proc. of ACL},
pages = {417--421},
publisher = {Association for Computational Linguistics},
title = {{STAIR} Captions: Constructing a Large-Scale {J}apanese Image Caption Dataset},
url = {https://aclanthology.org/P17-2066},
year = {2017}
}
@inproceedings{yu-etal-2022-beyond,
address = {Abu Dhabi, United Arab Emirates},
author = {Yu, Xinyan and
Chatterjee, Trina and
Asai, Akari and
Hu, Junjie and
Choi, Eunsol},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2022},
pages = {3725--3743},
publisher = {Association for Computational Linguistics},
title = {Beyond Counting Datasets: A Survey of Multilingual Dataset Construction and Necessary Resources},
url = {https://aclanthology.org/2022.findings-emnlp.273},
year = {2022}
}
@inproceedings{yue2024mmmu,
author = {Yue, Xiang and Ni, Yuansheng and Zhang, Kai and Zheng, Tianyu and Liu, Ruoqi and Zhang, Ge and Stevens, Samuel and Jiang, Dongfu and Ren, Weiming and Sun, Yuxuan and others},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages = {9556--9567},
title = {{MMMU}: A Massive Multi-Discipline Multimodal Understanding and Reasoning Benchmark for Expert {AGI}},
year = {2024}
}
@article{zeng2022cross,
author = {Zeng, Yan and Zhou, Wangchunshu and Luo, Ao and Cheng, Ziming and Zhang, Xinsong},
journal = {ArXiv preprint},
title = {Cross-view language modeling: Towards unified cross-lingual cross-modal pre-training},
url = {https://arxiv.org/abs/2206.00621},
volume = {abs/2206.00621},
year = {2022}
}
@article{zhang2023m3exam,
author = {Zhang, Wenxuan and Aljunied, Mahani and Gao, Chang and Chia, Yew Ken and Bing, Lidong},
journal = {Advances in Neural Information Processing Systems},
pages = {5484--5505},
title = {{M3Exam}: A Multilingual, Multimodal, Multilevel Benchmark for Examining Large Language Models},
volume = {36},
year = {2023}
}
@article{zheng2023judging,
author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
journal = {Advances in Neural Information Processing Systems},
pages = {46595--46623},
title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
volume = {36},
year = {2023}
}
@article{zheng2023judgingllmasajudgemtbenchchatbot,
author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
internal-note = {Likely duplicate of entry zheng2023judging (same work, NeurIPS 2023 version); consider consolidating citations under one key},
journal = {ArXiv preprint},
title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
url = {https://arxiv.org/abs/2306.05685},
volume = {abs/2306.05685},
year = {2023}
}
@article{zheng2024opencodeinterpreter,
author = {Zheng, Tianyu and Zhang, Ge and Shen, Tianhao and Liu, Xueling and Lin, Bill Yuchen and Fu, Jie and Chen, Wenhu and Yue, Xiang},
journal = {ArXiv preprint},
title = {{OpenCodeInterpreter}: Integrating Code Generation with Execution and Refinement},
url = {https://arxiv.org/abs/2402.14658},
volume = {abs/2402.14658},
year = {2024}
}
@misc{SoSoDocvqa,
author = {Sokoudjou Sonagu, Lo{\"\i}c and Sola, Yoann},
title = {{DocVQA} Dataset},
year = {2024},
url = {https://huggingface.co/datasets/cmarkea/doc-vqa},
keywords = {NLP, Multimodal}
}
@misc{AgDeTQA,
author = {Agonnoude, Tom and Delestre, Cyrile},
title = {Table {VQA} Dataset},
year = {2024},
url = {https://huggingface.co/datasets/cmarkea/table-vqa},
keywords = {NLP, Multimodal}
}
@misc{numina_math_datasets,
author = {Jia Li and Edward Beeching and Lewis Tunstall and Ben Lipkin and Roman Soletskyi and Shengyi Costa Huang and Kashif Rasul and Longhui Yu and Albert Jiang and Ziju Shen and Zihan Qin and Bin Dong and Li Zhou and Yann Fleureau and Guillaume Lample and Stanislas Polu},
title = {{NuminaMath}},
year = {2024},
publisher = {Numina},
journal = {Hugging Face repository},
howpublished = {\url{https://huggingface.co/AI-MO/NuminaMath-CoT}},
note = {\url{https://github.com/project-numina/aimo-progress-prize/blob/main/report/numina_dataset.pdf}}
}