@comment{GitHub page chrome and line-number gutter removed from scraped copy; bibliography entries follow.}
@article{abdin2024phi3technicalreporthighly,
 author = {Abdin, Marah and Jacobs, Sam Ade and Awan, Ammar Ahmad and Aneja, Jyoti and Awadallah, Ahmed and Awadalla, Hany and Bach, Nguyen and Bahree, Amit and Bakhtiari, Arash and Behl, Harkirat and others},
 journal = {ArXiv preprint},
 title = {{Phi-3} technical report: A highly capable language model locally on your phone},
 url = {https://arxiv.org/abs/2404.14219},
 volume = {abs/2404.14219},
 year = {2024}
}
@inproceedings{ahia-etal-2023-languages,
abstract = {Language models have graduated from being research prototypes to commercialized products offered as web APIs, and recent works have highlighted the multilingual capabilities of these products. The API vendors charge their users based on usage, more specifically on the number of {``}tokens{''} processed or generated by the underlying language models. What constitutes a token, however, is training data and model dependent with a large variance in the number of tokens required to convey the same information in different languages. In this work, we analyze the effect of this non-uniformity on the fairness of an API{'}s pricing policy across languages. We conduct a systematic analysis of the cost and utility of OpenAI{'}s language model API on multilingual benchmarks in 22 typologically diverse languages. We show evidence that speakers of a large number of the supported languages are overcharged while obtaining poorer results. These speakers tend to also come from regions where the APIs are less affordable, to begin with. Through these analyses, we aim to increase transparency around language model APIs{'} pricing policies and encourage the vendors to make them more equitable.},
author = {Ahia, Orevaoghene and
Kumar, Sachin and
Gonen, Hila and
Kasai, Jungo and
Mortensen, David and
Smith, Noah and
Tsvetkov, Yulia},
booktitle = {Proc. of EMNLP},
editor = {Bouamor, Houda and
Pino, Juan and
Bali, Kalika},
pages = {9904--9923},
title = {Do All Languages Cost the Same? Tokenization in the Era of Commercial Language Models},
year = {2023}
}
@inproceedings{alkhamissi-etal-2024-investigating,
abstract = {The intricate relationship between language and culture has long been a subject of exploration within the realm of linguistic anthropology. Large Language Models (LLMs), promoted as repositories of collective human knowledge, raise a pivotal question: do these models genuinely encapsulate the diverse knowledge adopted by different cultures? Our study reveals that these models demonstrate greater cultural alignment along two dimensions{---}firstly, when prompted with the dominant language of a specific culture, and secondly, when pretrained with a refined mixture of languages employed by that culture. We quantify cultural alignment by simulating sociological surveys, comparing model responses to those of actual survey participants as references. Specifically, we replicate a survey conducted in various regions of Egypt and the United States through prompting LLMs with different pretraining data mixtures in both Arabic and English with the personas of the real respondents and the survey questions. Further analysis reveals that misalignment becomes more pronounced for underrepresented personas and for culturally sensitive topics, such as those probing social values. Finally, we introduce Anthropological Prompting, a novel method leveraging anthropological reasoning to enhance cultural alignment. Our study emphasizes the necessity for a more balanced multilingual pretraining dataset to better represent the diversity of human experience and the plurality of different cultures with many implications on the topic of cross-lingual transfer.},
author = {AlKhamissi, Badr and
ElNokrashy, Muhammad and
Alkhamissi, Mai and
Diab, Mona},
booktitle = {Proc. of ACL},
editor = {Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek},
pages = {12404--12422},
title = {Investigating Cultural Alignment of Large Language Models},
year = {2024}
}
@incollection{Bengio+chapter2007,
author = {Bengio, Yoshua and LeCun, Yann},
booktitle = {Large Scale Kernel Machines},
title = {Scaling Learning Algorithms Towards {AI}},
year = {2007}
}
@article{beyer2024paligemma,
author = {Lucas Beyer and Andreas Steiner and André Susano Pinto and Alexander Kolesnikov and Xiao Wang and Daniel Salz and Maxim Neumann and Ibrahim Alabdulmohsin and Michael Tschannen and Emanuele Bugliarello and Thomas Unterthiner and Daniel Keysers and Skanda Koppula and Fangyu Liu and Adam Grycner and Alexey Gritsenko and Neil Houlsby and Manoj Kumar and Keran Rong and Julian Eisenschlos and Rishabh Kabra and Matthias Bauer and Matko Bošnjak and Xi Chen and Matthias Minderer and Paul Voigtlaender and Ioana Bica and Ivana Balazevic and Joan Puigcerver and Pinelopi Papalampidi and Olivier Henaff and Xi Xiong and Radu Soricut and Jeremiah Harmsen and Xiaohua Zhai},
journal = {ArXiv preprint},
title = {{PaliGemma: A versatile 3B VLM for transfer}},
url = {https://arxiv.org/abs/2407.07726},
volume = {abs/2407.07726},
year = {2024}
}
@article{biderman2024lessons,
author = {Biderman, Stella and Schoelkopf, Hailey and Sutawika, Lintang and Gao, Leo and Tow, Jonathan and Abbasi, Baber and Aji, Alham Fikri and Ammanamanchi, Pawan Sasanka and Black, Sidney and Clive, Jordan and others},
journal = {ArXiv preprint},
title = {Lessons from the Trenches on Reproducible Evaluation of Language Models},
url = {https://arxiv.org/abs/2405.14782},
volume = {abs/2405.14782},
year = {2024}
}
@inproceedings{blasi-etal-2022-systematic,
address = {Dublin, Ireland},
author = {Blasi, Damian and
Anastasopoulos, Antonios and
Neubig, Graham},
booktitle = {Proc. of ACL},
pages = {5486--5505},
publisher = {Association for Computational Linguistics},
title = {Systematic Inequalities in Language Technology Performance across the World{'}s Languages},
url = {https://aclanthology.org/2022.acl-long.376},
year = {2022}
}
@article{changpinyo2022maxm,
author = {Changpinyo, Soravit and Xue, Linting and Yarom, Michal and Thapliyal, Ashish V and Szpektor, Idan and Amelot, Julien and Chen, Xi and Soricut, Radu},
journal = {ArXiv preprint},
title = {MaXM: Towards multilingual visual question answering},
url = {https://arxiv.org/abs/2209.05401},
volume = {abs/2209.05401},
year = {2022}
}
@article{chen2023pali,
 author = {Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and others},
 journal = {ArXiv preprint},
 title = {{PaLI-X}: On scaling up a multilingual vision and language model},
 url = {https://arxiv.org/abs/2305.18565},
 volume = {abs/2305.18565},
 year = {2023}
}
@article{chen2024allava,
author = {Chen, Guiming Hardy and Chen, Shunian and Zhang, Ruifei and Chen, Junying and Wu, Xiangbo and Zhang, Zhiyi and Chen, Zhihong and Li, Jianquan and Wan, Xiang and Wang, Benyou},
journal = {ArXiv preprint},
title = {Allava: Harnessing gpt4v-synthesized data for a lite vision-language model},
url = {https://arxiv.org/abs/2402.11684},
volume = {abs/2402.11684},
year = {2024}
}
@misc{ChineseLLaVA,
author = {LinkSoul-AI},
note = {Accessed: 2024-10-01},
title = {Chinese-LLaVA},
year = {2023}
}
@misc{ChineseLLaVA_Med,
author = {BUAA},
note = {Accessed: 2024-10-01},
title = {Chinese-LLaVA-Med},
year = {2023}
}
@article{clark2020tydi,
address = {Cambridge, MA},
author = {Clark, Jonathan H. and
Choi, Eunsol and
Collins, Michael and
Garrette, Dan and
Kwiatkowski, Tom and
Nikolaev, Vitaly and
Palomaki, Jennimaria},
journal = {Transactions of the Association for Computational Linguistics},
pages = {454--470},
publisher = {MIT Press},
title = {{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages},
url = {https://aclanthology.org/2020.tacl-1.30},
volume = {8},
year = {2020}
}
@article{deepmind_gemini_report,
author = {Team, Gemini and Anil, Rohan and Borgeaud, Sebastian and Wu, Yonghui and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M and Hauth, Anja and others},
journal = {ArXiv preprint},
title = {Gemini: a family of highly capable multimodal models},
url = {https://arxiv.org/abs/2312.11805},
volume = {abs/2312.11805},
year = {2023}
}
@misc{deepvk2024gqa_ru,
author = {Belopolskih, Daniil and Spirin, Egor},
title = {GQA-ru},
year = {2024}
}
@article{deitke2024molmo,
author = {Deitke, Matt and Clark, Christopher and Lee, Sangho and Tripathi, Rohun and Yang, Yue and Park, Jae Sung and Salehi, Mohammadreza and Muennighoff, Niklas and Lo, Kyle and Soldaini, Luca and others},
journal = {ArXiv preprint},
title = {Molmo and PixMo: Open Weights and Open Data for State-of-the-Art Multimodal Models},
url = {https://arxiv.org/abs/2409.17146},
volume = {abs/2409.17146},
year = {2024}
}
@article{doan2024vintern,
author = {Doan, Khang T and Huynh, Bao G and Hoang, Dung T and Pham, Thuc D and Pham, Nhat H and Nguyen, Quan and Vo, Bang Q and Hoang, Suong N},
journal = {ArXiv preprint},
title = {Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese},
url = {https://arxiv.org/abs/2408.12480},
volume = {abs/2408.12480},
year = {2024}
}
@article{dubey2024llama,
author = {Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
journal = {ArXiv preprint},
title = {The llama 3 herd of models},
url = {https://arxiv.org/abs/2407.21783},
volume = {abs/2407.21783},
year = {2024}
}
@inproceedings{geigle_etal_2024_mblip,
author = {Geigle, Gregor and
Jain, Abhay and
Timofte, Radu and
Glava{\v{s}}, Goran},
booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
editor = {Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William},
pages = {7--25},
title = {m{BLIP}: Efficient Bootstrapping of Multilingual Vision-{LLM}s},
year = {2024}
}
@inproceedings{goodfellow2016deep,
 author = {Ruslan Salakhutdinov},
 biburl = {https://dblp.org/rec/conf/kdd/Salakhutdinov14.bib},
 booktitle = {Proc. of KDD},
 editor = {Sofus A. Macskassy and
Claudia Perlich and
Jure Leskovec and
Wei Wang and
Rayid Ghani},
 internal-note = {NOTE(review): citation key suggests Goodfellow/Bengio/Courville 2016 "Deep Learning" (book), but the entry data is Salakhutdinov's 2014 KDD tutorial -- verify which work the citing text intends},
 pages = {1973},
 publisher = {{ACM}},
 timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
 title = {Deep learning},
 url = {https://doi.org/10.1145/2623330.2630809},
 year = {2014}
}
@misc{gpt4o,
 author = {OpenAI},
 key = {gpt-4o},
 title = {Hello {GPT-4o}},
 url = {https://openai.com/index/hello-gpt-4o/},
 year = {2024}
}
@inproceedings{gupta2019lvis,
author = {Agrim Gupta and
Piotr Doll{\'{a}}r and
Ross B. Girshick},
biburl = {https://dblp.org/rec/conf/cvpr/GuptaDG19.bib},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2019, Long Beach, CA, USA, June 16-20, 2019},
pages = {5356--5364},
publisher = {Computer Vision Foundation / {IEEE}},
timestamp = {Mon, 20 Jan 2020 00:00:00 +0100},
title = {{LVIS:} {A} Dataset for Large Vocabulary Instance Segmentation},
url = {http://openaccess.thecvf.com/content\_CVPR\_2019/html/Gupta\_LVIS\_A\_Dataset\_for\_Large\_Vocabulary\_Instance\_Segmentation\_CVPR\_2019\_paper.html},
year = {2019}
}
@inproceedings{Hamborg2017,
author = {Hamborg, Felix and Meuschke, Norman and Breitinger, Corinna and Gipp, Bela},
booktitle = {Proceedings of the 15th International Symposium of Information Science},
location = {Berlin},
pages = {218--223},
title = {news-please: A Generic News Crawler and Extractor},
year = {2017}
}
@article{han2023reading,
author = {Han, Seungju and Kim, Junhyeok and Hessel, Jack and Jiang, Liwei and Chung, Jiwan and Son, Yejin and Choi, Yejin and Yu, Youngjae},
journal = {ArXiv preprint},
title = {Reading Books is Great, But Not if You Are Driving! Visually Grounded Reasoning about Defeasible Commonsense Norms},
url = {https://arxiv.org/abs/2310.10418},
volume = {abs/2310.10418},
year = {2023}
}
@inproceedings{hendrycksmeasuring2021,
author = {Dan Hendrycks and
Collin Burns and
Steven Basart and
Andy Zou and
Mantas Mazeika and
Dawn Song and
Jacob Steinhardt},
biburl = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
booktitle = {Proc. of ICLR},
publisher = {OpenReview.net},
timestamp = {Wed, 23 Jun 2021 01:00:00 +0200},
title = {Measuring Massive Multitask Language Understanding},
url = {https://openreview.net/forum?id=d7KBjmI3GmQ},
year = {2021}
}
@inproceedings{hessel2023androids,
 author = {Hessel, Jack and Marasovi{\'c}, Ana and Hwang, Jena D and Lee, Lillian and Da, Jeff and Zellers, Rowan and Mankoff, Robert and Choi, Yejin},
 booktitle = {Proc. of ACL},
 pages = {688--714},
 title = {Do Androids Laugh at Electric Sheep? Humor {``}Understanding{''} Benchmarks from The New Yorker Caption Contest},
 year = {2023}
}
@article{Hinton06,
author = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee Whye},
journal = {Neural Computation},
pages = {1527--1554},
title = {A Fast Learning Algorithm for Deep Belief Nets},
volume = {18},
year = {2006}
}
@article{hu2024minicpm,
author = {Hu, Shengding and Tu, Yuge and Han, Xu and He, Chaoqun and Cui, Ganqu and Long, Xiang and Zheng, Zhi and Fang, Yewei and Huang, Yuxiang and Zhao, Weilin and others},
journal = {ArXiv preprint},
title = {Minicpm: Unveiling the potential of small language models with scalable training strategies},
url = {https://arxiv.org/abs/2404.06395},
volume = {abs/2404.06395},
year = {2024}
}
@article{jain2021mural,
author = {Jain, Aashi and Guo, Mandy and Srinivasan, Krishna and Chen, Ting and Kudugunta, Sneha and Jia, Chao and Yang, Yinfei and Baldridge, Jason},
journal = {ArXiv preprint},
title = {Mural: multimodal, multitask retrieval across languages},
url = {https://arxiv.org/abs/2109.05125},
volume = {abs/2109.05125},
year = {2021}
}
@inproceedings{joshi-etal-2020-state,
address = {Online},
author = {Joshi, Pratik and
Santy, Sebastin and
Budhiraja, Amar and
Bali, Kalika and
Choudhury, Monojit},
booktitle = {Proc. of ACL},
pages = {6282--6293},
publisher = {Association for Computational Linguistics},
title = {The State and Fate of Linguistic Diversity and Inclusion in the {NLP} World},
url = {https://aclanthology.org/2020.acl-main.560},
year = {2020}
}
@inproceedings{kim2022donut,
author = {Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Nam, JeongYeon and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
booktitle = {European Conference on Computer Vision (ECCV)},
title = {OCR-Free Document Understanding Transformer},
year = {2022}
}
@inproceedings{kim2023prometheus,
author = {Kim, Seungone and Shin, Jamin and Cho, Yejin and Jang, Joel and Longpre, Shayne and Lee, Hwaran and Yun, Sangdoo and Shin, Seongjin and Kim, Sungdong and Thorne, James and others},
booktitle = {The Twelfth International Conference on Learning Representations},
title = {Prometheus: Inducing fine-grained evaluation capability in language models},
year = {2023}
}
@article{kim2024biggen,
author = {Kim, Seungone and Suk, Juyoung and Cho, Ji Yong and Longpre, Shayne and Kim, Chaeeun and Yoon, Dongkeun and Son, Guijin and Cho, Yejin and Shafayat, Sheikh and Baek, Jinheon and others},
journal = {ArXiv preprint},
title = {The BiGGen Bench: A Principled Benchmark for Fine-grained Evaluation of Language Models with Language Models},
url = {https://arxiv.org/abs/2406.05761},
volume = {abs/2406.05761},
year = {2024}
}
@article{kim2024prometheus,
author = {Kim, Seungone and Suk, Juyoung and Longpre, Shayne and Lin, Bill Yuchen and Shin, Jamin and Welleck, Sean and Neubig, Graham and Lee, Moontae and Lee, Kyungjae and Seo, Minjoon},
journal = {ArXiv preprint},
title = {Prometheus 2: An open source language model specialized in evaluating other language models},
url = {https://arxiv.org/abs/2405.01535},
volume = {abs/2405.01535},
year = {2024}
}
@article{krishna2017visual,
author = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A and others},
journal = {International journal of computer vision},
pages = {32--73},
title = {Visual genome: Connecting language and vision using crowdsourced dense image annotations},
volume = {123},
year = {2017}
}
@inproceedings{lai2023okapi,
author = {Lai, Viet and Nguyen, Chien and Ngo, Nghia and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan and Nguyen, Thien},
booktitle = {Proc. of EMNLP},
pages = {318--327},
title = {Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
year = {2023}
}
@article{lee2024prometheusvision,
author = {Lee, Seongyun and Kim, Seungone and Park, Sue Hyun and Kim, Geewook and Seo, Minjoon},
journal = {ArXiv preprint},
title = {Prometheusvision: Vision-language model as a judge for fine-grained evaluation},
url = {https://arxiv.org/abs/2401.06591},
volume = {abs/2401.06591},
year = {2024}
}
@article{lin2022fewshotlearningmultilinguallanguage,
 author = {Xi Victoria Lin and Todor Mihaylov and Mikel Artetxe and Tianlu Wang and Shuohui Chen and Daniel Simig and Myle Ott and Naman Goyal and Shruti Bhosale and Jingfei Du and Ramakanth Pasunuru and Sam Shleifer and Punit Singh Koura and Vishrav Chaudhary and Brian O'Horo and Jeff Wang and Luke Zettlemoyer and Zornitsa Kozareva and Mona Diab and Veselin Stoyanov and Xian Li},
 journal = {ArXiv preprint},
 title = {Few-shot Learning with Multilingual Language Models},
 url = {https://arxiv.org/abs/2112.10668},
 volume = {abs/2112.10668},
 year = {2021}
}
@inproceedings{liu2021visually,
address = {Online and Punta Cana, Dominican Republic},
author = {Liu, Fangyu and
Bugliarello, Emanuele and
Ponti, Edoardo Maria and
Reddy, Siva and
Collier, Nigel and
Elliott, Desmond},
booktitle = {Proc. of EMNLP},
pages = {10467--10485},
publisher = {Association for Computational Linguistics},
title = {Visually Grounded Reasoning across Languages and Cultures},
url = {https://aclanthology.org/2021.emnlp-main.818},
year = {2021}
}
@article{liu2023improvedllava,
 author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
 journal = {ArXiv preprint},
 title = {Improved Baselines with Visual Instruction Tuning},
 url = {https://arxiv.org/abs/2310.03744},
 volume = {abs/2310.03744},
 year = {2023}
}
@inproceedings{liu2023llava,
author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
booktitle = {NeurIPS},
title = {Visual Instruction Tuning},
year = {2023}
}
@misc{liu2024llavanext,
author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Li, Bo and Zhang, Yuanhan and Shen, Sheng and Lee, Yong Jae},
title = {LLaVA-NeXT: Improved reasoning, OCR, and world knowledge},
year = {2024}
}
@misc{LLaVA_JP_Instruct_108K,
author = {Toshi456},
note = {Accessed: 2024-10-01},
title = {LLaVA-JP-Instruct-108K Dataset},
year = {2023}
}
@misc{lmms_eval2024,
 author = {Li, Bo and Zhang, Peiyuan and Zhang, Kaichen and Pu, Fanyi and Du, Xinrun and Dong, Yuhao and Liu, Haotian and Zhang, Yuanhan and Zhang, Ge and Li, Chunyuan and Liu, Ziwei},
 title = {{LMMs-Eval}: Accelerating the Development of Large Multimodal Models},
 version = {v0.1.0},
 year = {2024}
}
@inproceedings{lumathvista,
author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi, Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
booktitle = {The Twelfth International Conference on Learning Representations},
title = {MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts},
year = {2024}
}
@inproceedings{masry2022chartqa,
address = {Dublin, Ireland},
author = {Masry, Ahmed and
Do, Xuan Long and
Tan, Jia Qing and
Joty, Shafiq and
Hoque, Enamul},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {2263--2279},
publisher = {Association for Computational Linguistics},
title = {{C}hart{QA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning},
url = {https://aclanthology.org/2022.findings-acl.177},
year = {2022}
}
@misc{MMMLU,
author = {OpenAI},
note = {Accessed: 2024-10-01},
title = {MMMLU Dataset},
year = {2024}
}
@inproceedings{NEURIPS2023_d08b6801,
author = {Ramaswamy, Vikram V. and Lin, Sing Yu and Zhao, Dora and Adcock, Aaron and van der Maaten, Laurens and Ghadiyaram, Deepti and Russakovsky, Olga},
booktitle = {Advances in Neural Information Processing Systems},
editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
pages = {66127--66137},
title = {GeoDE: a Geographically Diverse Evaluation Dataset for Object Recognition},
volume = {36},
year = {2023}
}
@inproceedings{ni2021m3p,
author = {Minheng Ni and
Haoyang Huang and
Lin Su and
Edward Cui and
Taroon Bharti and
Lijuan Wang and
Dongdong Zhang and
Nan Duan},
biburl = {https://dblp.org/rec/conf/cvpr/NiHSCBW0D21.bib},
booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR}
2021, virtual, June 19-25, 2021},
pages = {3977--3986},
publisher = {Computer Vision Foundation / {IEEE}},
timestamp = {Mon, 18 Jul 2022 01:00:00 +0200},
title = {{M3P:} Learning Universal Representations via Multitask Multilingual
Multimodal Pre-Training},
url = {https://openaccess.thecvf.com/content/CVPR2021/html/Ni\_M3P\_Learning\_Universal\_Representations\_via\_Multitask\_Multilingual\_Multimodal\_Pre-Training\_CVPR\_2021\_paper.html},
year = {2021}
}
@article{nllb2024scaling,
author = {{NLLB Team}},
journal = {Nature},
number = {8018},
pages = {841},
title = {Scaling neural machine translation to 200 languages},
volume = {630},
year = {2024}
}
@misc{OpenHermes25,
author = {Teknium},
title = {OpenHermes 2.5: An Open Dataset of Synthetic Data for Generalist LLM Assistants},
year = {2023}
}
@inproceedings{PALO,
author = {Rasheed, Hanoona and Maaz, Muhammad and Shaker, Abdelrahman and Khan, Salman and Cholakal, Hisham and Anwer, Rao M. and Baldwin, Tim and Felsberg, Michael and Khan, Fahad S.},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV 2025)},
title = {Palo: A Large Multilingual Multimodal Language Model},
year = {2025}
}
@inproceedings{pfeiffer2021xgqa,
 address = {Dublin, Ireland},
 author = {Pfeiffer, Jonas and
Geigle, Gregor and
Kamath, Aishwarya and
Steitz, Jan-Martin and
Roth, Stefan and
Vuli{\'c}, Ivan and
Gurevych, Iryna},
 booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
 internal-note = {NOTE(review): exact duplicate of entry pfeiffer2022xgqa -- consolidate all citations to a single key, then remove the other entry},
 pages = {2497--2511},
 publisher = {Association for Computational Linguistics},
 title = {x{GQA}: Cross-Lingual Visual Question Answering},
 url = {https://aclanthology.org/2022.findings-acl.196},
 year = {2022}
}
@inproceedings{pfeiffer2022xgqa,
address = {Dublin, Ireland},
author = {Pfeiffer, Jonas and
Geigle, Gregor and
Kamath, Aishwarya and
Steitz, Jan-Martin and
Roth, Stefan and
Vuli{\'c}, Ivan and
Gurevych, Iryna},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
pages = {2497--2511},
publisher = {Association for Computational Linguistics},
title = {x{GQA}: Cross-Lingual Visual Question Answering},
url = {https://aclanthology.org/2022.findings-acl.196},
year = {2022}
}
@article{pfeiffer2023mmt5,
author = {Pfeiffer, Jonas and Piccinno, Francesco and Nicosia, Massimo and Wang, Xinyi and Reid, Machel and Ruder, Sebastian},
journal = {ArXiv preprint},
title = {mmt5: Modular multilingual pre-training solves source language hallucinations},
url = {https://arxiv.org/abs/2305.14224},
volume = {abs/2305.14224},
year = {2023}
}
@article{ramaswamy2024geode,
 author = {Ramaswamy, Vikram V and Lin, Sing Yu and Zhao, Dora and Adcock, Aaron and van der Maaten, Laurens and Ghadiyaram, Deepti and Russakovsky, Olga},
 internal-note = {NOTE(review): same work as entry NEURIPS2023_d08b6801 (GeoDE, NeurIPS 2023) under a different key, type, and year -- consolidate citations to one entry},
 journal = {Advances in Neural Information Processing Systems},
 title = {{GeoDE}: a geographically diverse evaluation dataset for object recognition},
 volume = {36},
 year = {2024}
}
@inproceedings{reimers2019sentence,
address = {Hong Kong, China},
author = {Reimers, Nils and
Gurevych, Iryna},
booktitle = {Proc. of EMNLP},
pages = {3982--3992},
publisher = {Association for Computational Linguistics},
title = {Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks},
url = {https://aclanthology.org/D19-1410},
year = {2019}
}
@article{romero2024cvqa,
author = {Romero, David and Lyu, Chenyang and Wibowo, Haryo Akbarianto and Lynn, Teresa and Hamed, Injy and Kishore, Aditya Nanda and Mandal, Aishik and Dragonetti, Alina and Abzaliev, Artem and Tonja, Atnafu Lambebo and others},
journal = {ArXiv preprint},
title = {CVQA: Culturally-diverse Multilingual Visual Question Answering Benchmark},
url = {https://arxiv.org/abs/2406.05967},
volume = {abs/2406.05967},
year = {2024}
}
@article{schuhmann2022laion,
author = {Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others},
journal = {Advances in Neural Information Processing Systems},
pages = {25278--25294},
title = {Laion-5b: An open large-scale dataset for training next generation image-text models},
volume = {35},
year = {2022}
}
@article{shan2022ernie,
author = {Shan, Bin and Han, Yaqian and Yin, Weichong and Wang, Shuohuan and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
journal = {ArXiv preprint},
title = {Ernie-unix2: A unified cross-lingual cross-modal framework for understanding and generation},
url = {https://arxiv.org/abs/2211.04861},
volume = {abs/2211.04861},
year = {2022}
}
@article{shi2022languagemodelsmultilinguallanguage,
 author = {Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
 internal-note = {NOTE(review): same work as entry shi2022mgsm (ICLR 2023 version) -- consolidate citations to one key},
 journal = {ArXiv preprint},
 title = {Language Models are Multilingual Chain-of-Thought Reasoners},
 url = {https://arxiv.org/abs/2210.03057},
 volume = {abs/2210.03057},
 year = {2022}
}
@inproceedings{shi2022mgsm,
author = {Shi, Freda and Suzgun, Mirac and Freitag, Markus and Wang, Xuezhi and Srivats, Suraj and Vosoughi, Soroush and Chung, Hyung Won and Tay, Yi and Ruder, Sebastian and Zhou, Denny and others},
booktitle = {The Eleventh International Conference on Learning Representations},
title = {Language models are multilingual chain-of-thought reasoners},
year = {2022}
}
@inproceedings{song-etal-2023-globalbench,
abstract = {Despite the major advances in NLP, significant disparities in NLP system performance across languages still exist. Arguably, these are due to uneven resource allocation and sub-optimal incentives to work on less resourced languages. To track and further incentivize the global development of equitable language technology, we introduce GlobalBench. Prior multilingual benchmarks are static and have focused on a limited number of tasks and languages. In contrast, GlobalBench is an ever-expanding collection that aims to dynamically track progress on all NLP datasets in all languages. Rather than solely measuring accuracy, GlobalBench also tracks the estimated per-speaker utility and equity of technology across all languages, providing a multi-faceted view of how language technology is serving people of the world. Furthermore, GlobalBench is designed to identify the most under-served languages, and rewards research efforts directed towards those languages. At present, the most under-served languages are the ones with a relatively high population, but nonetheless overlooked by composite multilingual benchmarks (like Punjabi, Portuguese, and Wu Chinese). Currently, GlobalBench covers 966 datasets in 190 languages, and has 1,128 system submissions spanning 62 languages.},
address = {Singapore},
author = {Song, Yueqi and
Khanuja, Simran and
Liu, Pengfei and
Faisal, Fahim and
Ostapenko, Alissa and
Winata, Genta and
Aji, Alham Fikri and
Cahyawijaya, Samuel and
Tsvetkov, Yulia and
Anastasopoulos, Antonios and
Neubig, Graham},
booktitle = {Proc. of EMNLP},
editor = {Bouamor, Houda and
Pino, Juan and
Bali, Kalika},
pages = {14157--14171},
publisher = {Association for Computational Linguistics},
title = {{G}lobal{B}ench: A Benchmark for Global Progress in Natural Language Processing},
url = {https://aclanthology.org/2023.emnlp-main.875},
year = {2023}
}
@article{tang2024mtvqa,
author = {Tang, Jingqun and Liu, Qi and Ye, Yongjie and Lu, Jinghui and Wei, Shu and Lin, Chunhui and Li, Wanqing and Mahmood, Mohamad Fitri Faiz Bin and Feng, Hao and Zhao, Zhen and others},
journal = {ArXiv preprint},
title = {{MTVQA}: Benchmarking Multilingual Text-Centric Visual Question Answering},
url = {https://arxiv.org/abs/2405.11985},
volume = {abs/2405.11985},
year = {2024}
}
@inproceedings{thapliyal2022crossmodal,
address = {Abu Dhabi, United Arab Emirates},
author = {Thapliyal, Ashish V. and
Pont-Tuset, Jordi and
Chen, Xi and
Soricut, Radu},
booktitle = {Proc. of EMNLP},
pages = {715--729},
publisher = {Association for Computational Linguistics},
title = {{Crossmodal-3600}: A Massively Multilingual Multimodal Evaluation Dataset},
url = {https://aclanthology.org/2022.emnlp-main.45},
year = {2022}
}
@article{tong2024cambrian,
author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others},
journal = {ArXiv preprint},
title = {{Cambrian-1}: A Fully Open, Vision-Centric Exploration of Multimodal {LLM}s},
url = {https://arxiv.org/abs/2406.16860},
volume = {abs/2406.16860},
year = {2024}
}
@online{wikidump,
author = {{Wikimedia Foundation}},
title = {Wikimedia Downloads},
url = {https://dumps.wikimedia.org}
}
@inproceedings{xue2020mt5,
address = {Online},
author = {Xue, Linting and
Constant, Noah and
Roberts, Adam and
Kale, Mihir and
Al-Rfou, Rami and
Siddhant, Aditya and
Barua, Aditya and
Raffel, Colin},
booktitle = {Proc. of NAACL},
pages = {483--498},
publisher = {Association for Computational Linguistics},
title = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
url = {https://aclanthology.org/2021.naacl-main.41},
year = {2021}
}
@article{yang2024qwen2,
author = {Yang, An and Yang, Baosong and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Zhou, Chang and Li, Chengpeng and Li, Chengyuan and Liu, Dayiheng and Huang, Fei and others},
journal = {ArXiv preprint},
title = {{Qwen2} Technical Report},
url = {https://arxiv.org/abs/2407.10671},
volume = {abs/2407.10671},
year = {2024}
}
@article{ye2023flask,
author = {Ye, Seonghyeon and Kim, Doyoung and Kim, Sungdong and Hwang, Hyeonbin and Kim, Seungone and Jo, Yongrae and Thorne, James and Kim, Juho and Seo, Minjoon},
journal = {ArXiv preprint},
title = {{FLASK}: Fine-Grained Language Model Evaluation Based on Alignment Skill Sets},
url = {https://arxiv.org/abs/2307.10928},
volume = {abs/2307.10928},
year = {2023}
}
@inproceedings{yoshikawa2017stair,
address = {Vancouver, Canada},
author = {Yoshikawa, Yuya and
Shigeto, Yutaro and
Takeuchi, Akikazu},
booktitle = {Proc. of ACL},
pages = {417--421},
publisher = {Association for Computational Linguistics},
title = {{STAIR} Captions: Constructing a Large-Scale {J}apanese Image Caption Dataset},
url = {https://aclanthology.org/P17-2066},
year = {2017}
}
@inproceedings{yu-etal-2022-beyond,
address = {Abu Dhabi, United Arab Emirates},
author = {Yu, Xinyan and
Chatterjee, Trina and
Asai, Akari and
Hu, Junjie and
Choi, Eunsol},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2022},
pages = {3725--3743},
publisher = {Association for Computational Linguistics},
title = {Beyond Counting Datasets: A Survey of Multilingual Dataset Construction and Necessary Resources},
url = {https://aclanthology.org/2022.findings-emnlp.273},
year = {2022}
}
@inproceedings{yue2024mmmu,
author = {Yue, Xiang and Ni, Yuansheng and Zhang, Kai and Zheng, Tianyu and Liu, Ruoqi and Zhang, Ge and Stevens, Samuel and Jiang, Dongfu and Ren, Weiming and Sun, Yuxuan and others},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages = {9556--9567},
title = {{MMMU}: A Massive Multi-Discipline Multimodal Understanding and Reasoning Benchmark for Expert {AGI}},
year = {2024}
}
@article{zeng2022cross,
author = {Zeng, Yan and Zhou, Wangchunshu and Luo, Ao and Cheng, Ziming and Zhang, Xinsong},
journal = {ArXiv preprint},
title = {Cross-view language modeling: Towards unified cross-lingual cross-modal pre-training},
url = {https://arxiv.org/abs/2206.00621},
volume = {abs/2206.00621},
year = {2022}
}
@article{zhang2023m3exam,
author = {Zhang, Wenxuan and Aljunied, Mahani and Gao, Chang and Chia, Yew Ken and Bing, Lidong},
journal = {Advances in Neural Information Processing Systems},
pages = {5484--5505},
title = {{M3Exam}: A Multilingual, Multimodal, Multilevel Benchmark for Examining Large Language Models},
volume = {36},
year = {2023}
}
@article{zheng2023judging,
author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
journal = {Advances in Neural Information Processing Systems},
pages = {46595--46623},
title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
volume = {36},
year = {2023}
}
@article{zheng2023judgingllmasajudgemtbenchchatbot,
author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
internal-note = {Likely duplicate of entry zheng2023judging (same work, NeurIPS 2023 version); consider consolidating citations under one key},
journal = {ArXiv preprint},
title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
url = {https://arxiv.org/abs/2306.05685},
volume = {abs/2306.05685},
year = {2023}
}
@article{zheng2024opencodeinterpreter,
author = {Zheng, Tianyu and Zhang, Ge and Shen, Tianhao and Liu, Xueling and Lin, Bill Yuchen and Fu, Jie and Chen, Wenhu and Yue, Xiang},
journal = {ArXiv preprint},
title = {{OpenCodeInterpreter}: Integrating Code Generation with Execution and Refinement},
url = {https://arxiv.org/abs/2402.14658},
volume = {abs/2402.14658},
year = {2024}
}
@misc{SoSoDocvqa,
author = {Sokoudjou Sonagu, Lo{\"\i}c and Sola, Yoann},
title = {{DocVQA} Dataset},
year = {2024},
url = {https://huggingface.co/datasets/cmarkea/doc-vqa},
keywords = {NLP, Multimodal}
}
@misc{AgDeTQA,
author = {Agonnoude, Tom and Delestre, Cyrile},
title = {Table {VQA} Dataset},
year = {2024},
url = {https://huggingface.co/datasets/cmarkea/table-vqa},
keywords = {NLP, Multimodal}
}
@misc{numina_math_datasets,
author = {Jia Li and Edward Beeching and Lewis Tunstall and Ben Lipkin and Roman Soletskyi and Shengyi Costa Huang and Kashif Rasul and Longhui Yu and Albert Jiang and Ziju Shen and Zihan Qin and Bin Dong and Li Zhou and Yann Fleureau and Guillaume Lample and Stanislas Polu},
title = {{NuminaMath}},
year = {2024},
publisher = {Numina},
journal = {Hugging Face repository},
howpublished = {\url{https://huggingface.co/AI-MO/NuminaMath-CoT}},
note = {\url{https://github.com/project-numina/aimo-progress-prize/blob/main/report/numina_dataset.pdf}}
}