--
-- Copyright (c) 2021-2025 Zeping Lee
-- Released under the MIT license.
-- Repository: https://github.com/zepinglee/citeproc-lua
--

local bibtex2csl = {}

local uni_utf8
local bibtex_parser
local bibtex_data
local journal_data
local latex_parser
local unicode
local util

-- The module names differ depending on whether `kpse` can be required.
local using_luatex, _ = pcall(require, "kpse")
if using_luatex then
  uni_utf8 = require("unicode").utf8
  bibtex_parser = require("citeproc-bibtex-parser")
  bibtex_data = require("citeproc-bibtex-data")
  unicode = require("citeproc-unicode")
  util = require("citeproc-util")
else
  uni_utf8 = require("lua-utf8")
  bibtex_parser = require("citeproc.bibtex-parser")
  bibtex_data = require("citeproc.bibtex-data")
  unicode = require("citeproc.unicode")
  util = require("citeproc.util")
end


---@alias CslItem ItemData
---@alias CslData CslItem[]

---Parse BibTeX content and convert to CSL-JSON
---@param str string
---@param keep_unknown_commands boolean? Keep unknown LaTeX markups.
---@param case_protection boolean? Add case-protection to braces as in BibTeX.
---@param sentence_case_title boolean? Convert `title` and `booktitle` to sentence case.
---@param check_sentence_case boolean? Check titles that are already sentence cased and do not convert them.
---@return CslData?, Exception[]
function bibtex2csl.parse_bibtex_to_csl(str, keep_unknown_commands, case_protection,
                                        sentence_case_title, check_sentence_case)
  -- Collect the values of the macros listed in `bibtex_data.macros`;
  -- the resulting table is handed to the parser as its second argument.
  local macro_values = {}
  for macro_name, macro_data in pairs(bibtex_data.macros) do
    macro_values[macro_name] = macro_data.value
  end

  local parsed, exceptions = bibtex_parser.parse(str, macro_values)
  if not parsed then
    -- Nothing to convert: only the exceptions are reported.
    return nil, exceptions
  end

  -- TODO: Ideally we should load all .bib files and then resolve crossrefs and related
  bibtex_parser.resolve_crossrefs(parsed.entries, bibtex_data.entries_by_id)
  local csl_data = bibtex2csl.convert_to_csl_data(
    parsed, keep_unknown_commands, case_protection, sentence_case_title, check_sentence_case)

  return csl_data, exceptions
end


---@param bib BibtexData
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param check_sentence_case boolean?
---@return CslData
function bibtex2csl.convert_to_csl_data(bib, keep_unknown_commands, case_protection,
                                        sentence_case_title, check_sentence_case)
  local csl_data = {}

  -- BibTeX looks for crossref in a case-insensitive manner.
  -- `bib_entry_dict` is keyed by the case-folded entry key,
  -- `csl_item_dict` by the item id (the entry key as written).
  local bib_entry_dict = {}
  local csl_item_dict = {}
  for _, entry in ipairs(bib.entries) do
    bib_entry_dict[unicode.casefold(entry.key)] = entry
    local item = bibtex2csl.convert_to_csl_item(
      entry, keep_unknown_commands, case_protection, sentence_case_title, check_sentence_case)
    table.insert(csl_data, item)
    csl_item_dict[item.id] = item
  end

  bibtex2csl.resolve_related(csl_item_dict, bib_entry_dict)

  return csl_data
end


---Convert a single BibTeX entry to a CSL item.
---@param entry BibtexEntry
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param check_sentence_case boolean?
---@param disable_journal_abbreviation boolean? When truthy, the journal abbreviation check is skipped.
---@return CslItem
function bibtex2csl.convert_to_csl_item(entry, keep_unknown_commands, case_protection,
                                        sentence_case_title, check_sentence_case,
                                        disable_journal_abbreviation)
  ---@type CslItem
  local item = {
    id = entry.key,
    type = "document",
  }

  bibtex2csl.pre_process_special_fields(item, entry)

  -- First convert primary fields
  for field, csl_field in pairs(bibtex_data.primary_fields) do
    local value = entry.fields[field]
    if value then
      -- The target CSL field comes from `primary_fields`; only the converted
      -- value of `convert_field` is used here.
      local _, csl_value = bibtex2csl.convert_field(
        field, value, keep_unknown_commands, case_protection, sentence_case_title,
        item.language, check_sentence_case)
      if csl_field and csl_value and not item[csl_field] then
        item[csl_field] = csl_value
      end
    end
  end

  -- Convert the fields in a fixed order
  local field_list = {}
  for field, _ in pairs(entry.fields) do
    table.insert(field_list, field)
  end
  table.sort(field_list)
  for _, field in ipairs(field_list) do
    local value = entry.fields[field]
    local csl_field, csl_value = bibtex2csl.convert_field(
      field, value, keep_unknown_commands, case_protection, sentence_case_title,
      item.language, check_sentence_case)
    -- A CSL field that is already set is never overwritten.
    if csl_field and csl_value and not item[csl_field] then
      item[csl_field] = csl_value
    end
  end

  bibtex2csl.post_process_special_fields(item, entry, disable_journal_abbreviation)

  return item
end


---Set the item fields that must be known before the generic field
---conversion runs (type, edition, language, status/genre) and merge the
---title-like fields of the entry.
---@param item CslItem
---@param entry BibtexEntry
function bibtex2csl.pre_process_special_fields(item, entry)
  -- CSL types
  local type_data = bibtex_data.types[entry.type]
  if type_data and type_data.csl then
    item.type = type_data.csl
  elseif entry.fields.url then
    item.type = "webpage"
  end

  -- BibTeX's `edition` is expected to be an ordinal.
  if entry.fields.edition then
    item.edition = util.convert_ordinal_to_arabic(entry.fields.edition)
  end

  -- language: convert `babel` language to ISO 639-1 language code
  local lang = entry.fields.langid or entry.fields.language
  if lang then
    item.language = bibtex_data.language_code_map[unicode.casefold(lang)]
  end
  -- if not item.language then
  --   if util.has_cjk_char(item.title) then
  --     item.language = "zh"
  --   end
  -- end

  -- Merge title, maintitle, subtitle, titleaddon
  bibtex2csl.process_titles(entry)

  -- biblatex-apa
  if entry.fields.howpublished then
    local howpublished = string.lower(entry.fields.howpublished)
    if howpublished == "advance online publication" then
      -- TODO: get_locale_term("advance online publication" )
      item.status = "Advance online publication"
    elseif howpublished == "manunpub" then
      -- biblatex-apa
      item.genre = "Unpublished manuscript"
    elseif howpublished == "maninprep" then
      -- biblatex-apa
      item.genre = "Manuscript in preparation"
    elseif howpublished == "mansub" then
      -- biblatex-apa
      item.genre = "Manuscript submitted for publication"
    end
  end
  if entry.fields.pubstate and string.lower(entry.fields.pubstate) == "inpress" then
    -- TODO: get_locale_term("in press")
    item.status = "in press"
  end
end


---Merge the `*subtitle` fields into their main title fields and map
---`maintitle` onto the matching title field. The entry's `fields` table is
---modified in place.
---@param entry BibtexEntry
function bibtex2csl.process_titles(entry)
  local fields = entry.fields

  -- title and subtitle
  if fields.subtitle then
    if fields.title then
      -- The short title is the main title WITHOUT the subtitle, so it has to
      -- be captured before the two are joined (same as for `booktitle`).
      if not fields.shorttitle then
        fields.shorttitle = fields.title
      end
      fields.title = util.join_title(fields.title, fields.subtitle)
    else
      fields.title = fields.subtitle
    end
    fields.subtitle = nil
  end

  -- booktitle and booksubtitle
  if fields.booksubtitle then
    if not fields["container-title-short"] then
      fields["container-title-short"] = fields.booktitle
    end
    if fields.booktitle then
      fields.booktitle = util.join_title(fields.booktitle, fields.booksubtitle)
    else
      fields.booktitle = fields.booksubtitle
    end
    fields.booksubtitle = nil
  end

  -- mainsubtitle
  if fields.mainsubtitle then
    if fields.maintitle then
      fields.maintitle = util.join_title(fields.maintitle, fields.mainsubtitle)
    else
      fields.maintitle = fields.mainsubtitle
    end
  end

  -- maintitle
  if fields.maintitle then
    if entry.type == "audio" or entry.type == "video" then
      -- maintitle is the container-title
      fields["container-title"] = fields.maintitle
    elseif fields.booktitle then
      -- maintitle is with booktitle
      if not fields["volume-title"] then
        fields["volume-title"] = fields.booktitle
        fields.booktitle = fields.maintitle
      end
    else
      -- maintitle is with title
      if fields.title then
        if not fields["volume-title"] then
          fields["volume-title"] = fields.title
          fields.title = fields.maintitle
        end
      else
        -- This is unlikely to happen.
        fields.title = fields.maintitle
      end
    end
  end

  -- journaltitle/journal and journalsubtitle
  if fields.journalsubtitle then
    if fields.journaltitle then
      fields.journaltitle = util.join_title(fields.journaltitle, fields.journalsubtitle)
    elseif fields.journal then
      -- Join with the subtitle (not with the journal name itself).
      fields.journal = util.join_title(fields.journal, fields.journalsubtitle)
    end
    fields.journalsubtitle = nil
  end

  -- issuetitle and issuesubtitle
  if fields.issuesubtitle then
    if fields.issuetitle then
      fields.issuetitle = util.join_title(fields.issuetitle, fields.issuesubtitle)
    else
      fields.issuetitle = fields.issuesubtitle
    end
    fields.issuesubtitle = nil
  end
end


---Convert BibTeX field to CSL field
---@param bib_field string
---@param value string
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param sentence_case_title boolean?
---@param language string?
---@param check_sentence_case boolean?
---@return string? csl_field
---@return string | table | number? csl_value
function bibtex2csl.convert_field(bib_field, value, keep_unknown_commands, case_protection,
                                  sentence_case_title, language, check_sentence_case)
  -- Fields without an entry in `bibtex_data.fields`, or without a string
  -- `csl` mapping, are not converted.
  local field_data = bibtex_data.fields[bib_field]
  if not field_data then
    return nil, nil
  end
  local csl_field = field_data.csl
  if not (csl_field and type(csl_field) == "string") then
    return nil, nil
  end

  -- The LaTeX parser is loaded lazily on first use.
  if using_luatex then
    latex_parser = latex_parser or require("citeproc-latex-parser")
  else
    latex_parser = latex_parser or require("citeproc.latex-parser")
  end

  local field_type = field_data.type
  local csl_value
  if field_type == "name" then
    -- 1. unicode 2. purify (remove LaTeX markups) 3. plain text 4. split name parts
    value = latex_parser.latex_to_unicode(value)
    local names = bibtex_parser.split_names(value)
    csl_value = {}
    for i, name_str in ipairs(names) do
      local name_dict = bibtex_parser.split_name_parts(name_str)
      csl_value[i] = bibtex2csl.convert_to_csl_name(name_dict)
    end

  elseif field_type == "date" then
    if string.match(value, "\\") then
      -- "{\noopsort{1973c}}1981"
      value = latex_parser.latex_to_pseudo_html(value, false, false)
    end
    -- "-3000~" should not be converted to "-3000 "
    csl_value = util.parse_edtf(value)

  elseif bib_field == "title" or bib_field == "shorttitle"
      or bib_field == "booktitle" or bib_field == "container-title-short" then
    -- 1. unicode 2. sentence case 3. html tag
    -- Sentence-casing is applied only when no language is known or the
    -- language code starts with "en".
    if sentence_case_title and (not language or util.startswith(language, "en")) then
      csl_value = latex_parser.latex_to_sentence_case_pseudo_html(
        value, keep_unknown_commands, case_protection, check_sentence_case)
    else
      csl_value = latex_parser.latex_to_pseudo_html(value, keep_unknown_commands, case_protection)
    end

  else
    -- 1. unicode 2. html tag
    csl_value = latex_parser.latex_to_pseudo_html(value, keep_unknown_commands, case_protection)
    if csl_field == "volume" or csl_field == "page" then
      -- Only the first result of gsub (the string) is kept by the assignment.
      csl_value = string.gsub(csl_value, util.unicode["en dash"], "-")
    end
  end

  return csl_field, csl_value
end


---Remove all braces from a name part.
---@param name_part string?
---@return string?
local function clean_name_part(name_part)
  if not name_part then
    return nil
  end
  -- The parentheses drop the substitution count returned by gsub.
  return (string.gsub(name_part, "[{}]", ""))
end


---Convert the name parts produced by the BibTeX name splitter
---(`first`, `von`, `last`, `jr`) to a CSL name.
---@param bibtex_name table
---@return table csl_name
function bibtex2csl.convert_to_csl_name(bibtex_name)
  -- A name that consists of a single brace group only is kept as a literal
  -- name, with the outer braces removed.
  if bibtex_name.last and not (bibtex_name.first or bibtex_name.von or bibtex_name.jr)
      and string.match(bibtex_name.last, "^%b{}$") then
    return {
      literal = string.sub(bibtex_name.last, 2, -2),
    }
  end
  local csl_name = {
    family = clean_name_part(bibtex_name.last),
    ["non-dropping-particle"] = clean_name_part(bibtex_name.von),
    given = clean_name_part(bibtex_name.first),
    suffix = clean_name_part(bibtex_name.jr),
  }
  return csl_name
end


local arxiv_url_prefix = "https://arxiv.org/abs/"
local pubmed_url_prefix = "https://www.ncbi.nlm.nih.gov/pubmed/"
local pubmed_central_url_prefix = "https://www.ncbi.nlm.nih.gov/pmc/articles/"


---Adjust the converted item using information that needs the whole entry:
---type refinements by `entrysubtype`/eprint data, date parts from `month`,
---`number`, `organization`, DOI/PMID/PMCID clean-up and journal abbreviations.
---The item is modified in place.
---@param item CslItem
---@param entry BibtexEntry
---@param disable_journal_abbreviation boolean? When truthy, the journal abbreviation check is skipped.
function bibtex2csl.post_process_special_fields(item, entry, disable_journal_abbreviation)
  local bib_type = entry.type
  local bib_fields = entry.fields

  -- biblatex-apa
  if item.type == "article-journal" and entry.fields.entrysubtype == "nonacademic" then
    item.type = "article-magazine"
    item.genre = nil

  elseif item.type == "motion_picture" then
    if entry.fields.entrysubtype == "tvseries" then
      item.type = "broadcast"
      if item.genre == "tvseries" then
        item.genre = "TV series"
      end
    elseif entry.fields.entrysubtype == "tvepisode" then
      item.type = "broadcast"
      if item.genre == "tvepisode" then
        item.genre = "TV series episode"
      end
    end

  elseif item.type == "song" then
    if entry.fields.entrysubtype == "podcast" then
      item.type = "broadcast"
      if item.genre == "podcast" then
        item.genre = "Audio podcast"
      end
    elseif entry.fields.entrysubtype == "podcastepisode" then
      item.type = "broadcast"
      if item.genre == "podcastepisode" then
        item.genre = "Audio podcast episode"
      end
    elseif entry.fields.entrysubtype == "interview" then
      item.type = "interview"
    elseif entry.fields.entrysubtype == "speech" then
      item.type = "speech"
    end

  elseif item.type == "graphic" then
    item["archive-place"] = item["publisher-place"]
    if entry.fields.entrysubtype == "map" then
      item.type = "map"
    end

  elseif item.type == "webpage" then
    -- eprinttype is mapped to
    -- - `archive` for Google books;
    -- - `container-title` for twitter post;
    -- - `publisher` for arXiv preprint;
    local eprint_type_map = {
      facebook = "post",
      instagram = "post",
      reddit = "post",
      twitter = "post",
      arxiv = "preprint",
      psyarxiv = "preprint",
      pubmed = "preprint",
      ["pubmed central"] = "preprint",
    }
    -- Biblatex's `online` type can be mapped to `post`, `preprint`
    -- local eprinttype = entry.fields.eprinttype or entry.fields.archiveprefix
    if item.archive then
      local eprint_type = eprint_type_map[string.lower(item.archive)]
      if eprint_type then
        item.type = eprint_type
      elseif item.number then
        item.type = "preprint"
      elseif entry.fields.eprint then
        item.type = "preprint"
        item.number = entry.fields.eprint
      elseif item.DOI then
        item.type = "preprint"
      elseif item.URL then
        if util.startswith(item.URL, arxiv_url_prefix) then
          item.type = "preprint"
        elseif util.startswith(item.URL, pubmed_url_prefix) then
          item.type = "preprint"
        elseif util.startswith(item.URL, pubmed_central_url_prefix) then
          item.type = "preprint"
        end
      end

      if item.type == "preprint" then
        if not item.publisher then
          item.publisher = item.archive
          item.archive = nil
        end
        if not item.number then
          item.number = entry.fields.eprint
        end
      else
        if not item["container-title"] then
          item["container-title"] = item.archive
          item.archive = nil
        end
      end
    end
  end

  -- event-date
  if item["event-date"] and not item.issued then
    item.issued = util.deep_copy(item["event-date"])
  end

  -- event-title: for compatibility with CSL v1.0.1 and earlier versions
  if item["event-title"] then
    item.event = item["event-title"]
  end

  -- Journal abbreviations
  if not disable_journal_abbreviation then
    if item.type == "article-journal" or item.type == "article-magazine"
        or item.type == "article-newspaper" then
      bibtex2csl.check_journal_abbreviations(item)
    end
  end

  if not item.genre then
    if bib_type == "phdthesis" then
      -- from APA
      item.genre = "Doctoral dissertation"
    elseif bib_type == "mastersthesis" then
      item.genre = "Master’s thesis"
    end
  end

  -- month
  -- local month = bib_fields.month
  local month_text = bib_fields.month
  if month_text then
    -- The parser is normally loaded by `convert_field`; load it here as well
    -- so that this function does not depend on a previous conversion.
    if not latex_parser then
      if using_luatex then
        latex_parser = require("citeproc-latex-parser")
      else
        latex_parser = require("citeproc.latex-parser")
      end
    end
    month_text = latex_parser.latex_to_pseudo_html(month_text, false, false)
    -- Accepted forms: "<month> <day>", "<day> <month>" and "<month>".
    local month, day = uni_utf8.match(month_text, "^(%a+)%.?,?%s+(%d+)%a*$")
    if not month then
      day, month = uni_utf8.match(month_text, "^(%d+)%a*%s+(%a+)%.?$")
    end
    if not month then
      month = string.match(month_text, "^(%a+)%.?$")
    end
    if month then
      month = bibtex_data.months[unicode.casefold(month)]
    end
    -- The month (and day) are only added when `issued` has no month yet.
    if month and item.issued and item.issued["date-parts"] and
        item.issued["date-parts"][1] and
        item.issued["date-parts"][1][2] == nil then
      item.issued["date-parts"][1][2] = tonumber(month)
      if day then
        item.issued["date-parts"][1][3] = tonumber(day)
      end
    end
  end

  -- number
  if bib_fields.number then
    if item.type == "article-journal" or item.type == "article-magazine"
        or item.type == "article-newspaper" or item.type == "periodical" then
      if not item.issue then
        item.issue = bib_fields.number
      end
    elseif item["collection-title"] and not item["collection-number"] then
      item["collection-number"] = bib_fields.number
    elseif not item.number then
      -- A single hyphen between two non-hyphen characters is escaped as "\-".
      item.number = string.gsub(bib_fields.number, "([^-])%-([^-])", "%1\\-%2")
    end
  end

  -- organization: the `organizer` that sponsors a conference or a `publisher` that publishes a `@manual` or `@online`.
  if bib_fields.organization then
    if item.publisher or bib_type == "inproceedings" or bib_type == "proceedings" then
      if not item.organizer then
        item.organizer = {
          literal = bib_fields.organization,
        }
      end
    elseif not item.publisher then
      item.publisher = bib_fields.organization
    end
  end

  -- DOI
  if type(item.DOI) == "string" then
    item.DOI = util.remove_prefix(item.DOI, "https://doi.org/")
    item.DOI = util.remove_prefix(item.DOI, "doi.org/")
  end

  -- PMID
  if bib_fields.eprint and type(bib_fields.eprinttype) == "string" and
      string.lower(bib_fields.eprinttype) == "pubmed" and not item.PMID then
    item.PMID = bib_fields.eprint
  end

  if item.URL then
    if item.type == "preprint" and util.startswith(item.URL, arxiv_url_prefix) then
      if not item.publisher then
        item.publisher = "arXiv"
        item.archive = nil
      end
      if not item.number then
        -- The arXiv identifier is what follows the arXiv URL prefix.
        item.number = util.remove_prefix(item.URL, arxiv_url_prefix)
      end
    end

    if util.startswith(item.URL, pubmed_url_prefix) then
      if not item.publisher then
        item.publisher = "PubMed"
        item.archive = nil
      end
      if not item.PMID then
        item.PMID = util.remove_prefix(item.URL, pubmed_url_prefix)
      end
    end

    if util.startswith(item.URL, pubmed_central_url_prefix) then
      if not item.publisher then
        item.publisher = "PubMed Central"
        item.archive = nil
      end
      if not item.PMCID then
        item.PMCID = util.remove_prefix(item.URL, pubmed_central_url_prefix)
      end
    end
  end

  -- `APA Education [@APAEducation], (2018, June 29). College students are forming mental-health clubs-and they're making a difference @washingtonpost [Thumbnail with link attached]`
  -- The `[Thumbnail with link attached]` should not be italicized.
  -- if bib_fields.titleaddon and item.genre and item.genre ~= bib_fields.titleaddon then
  --   if item.title then
  --     item.title = string.format("%s [%s]", item.title, bib_fields.titleaddon)
  --   end
  -- end
end


---Fill in `container-title-short` from the journal data, or expand an
---abbreviated `container-title` to its full form. Nothing is done when the
---item has no `container-title` or already has a `container-title-short`.
---@param item CslItem
function bibtex2csl.check_journal_abbreviations(item)
  if item["container-title"] and not item["container-title-short"] then
    -- The journal data is loaded lazily on first use.
    if not journal_data then
      if using_luatex then
        journal_data = require("citeproc-journal-data")
      else
        journal_data = require("citeproc.journal-data")
      end
    end
    -- Look-up key: the title without periods, case-folded. The intermediate
    -- local drops the substitution count returned by gsub.
    local title_without_periods = string.gsub(item["container-title"], "%.", "")
    local key = unicode.casefold(title_without_periods)
    local abbr = journal_data.abbrevs[key]
    if abbr then
      item["container-title-short"] = abbr
    else
      local full = journal_data.unabbrevs[key]
      if full then
        -- The given title is itself an abbreviation.
        item["container-title-short"] = item["container-title"]
        item["container-title"] = full
      end
    end
  end
end


-- Fields copied from the related item (key) to the citing item (value)
-- for reprints and translations.
local original_field_dict = {
  author = "original-author",
  issued = "original-date",
  publisher = "original-publisher",
  ["publisher-place"] = "original-publisher-place",
  title = "original-title",
}

-- Fields copied from the related item (key) to the citing item (value)
-- for reviews.
local reviewed_field_dict = {
  author = "reviewed-author",
  genre = "reviewed-genre",
  title = "reviewed-title",
}


---Copy the fields listed in `field_map` from `related_csl_item` to
---`csl_item`; fields that are already set on `csl_item` are kept.
---@param csl_item table
---@param related_csl_item table
---@param field_map table<string, string> source field -> target field
local function copy_related_fields(csl_item, related_csl_item, field_map)
  for source_field, target_field in pairs(field_map) do
    if not csl_item[target_field] and related_csl_item[source_field] then
      csl_item[target_field] = util.deep_copy(related_csl_item[source_field])
    end
  end
end


---Resolve the `related`/`relatedtype` fields of the entries by copying
---data from the related item. A warning is issued when the related entry
---is not found.
---@param csl_item_dict table CSL items keyed by item id
---@param bib_entry_dict table BibTeX entries keyed by case-folded entry key
function bibtex2csl.resolve_related(csl_item_dict, bib_entry_dict)
  for _, entry in pairs(bib_entry_dict) do
    local related_key = entry.fields.related
    local related_type = entry.fields.relatedtype
    if related_key then
      local related_bib_entry = bib_entry_dict[unicode.casefold(related_key)]
      if related_bib_entry then
        local csl_item = csl_item_dict[entry.key]
        local related_csl_item = csl_item_dict[related_bib_entry.key]
        if related_type == "reprintof" or related_type == "reprintfrom"
            or related_type == "translationof" or related_type == "translationfrom" then
          copy_related_fields(csl_item, related_csl_item, original_field_dict)
        elseif related_type == "reviewof" then
          copy_related_fields(csl_item, related_csl_item, reviewed_field_dict)
        end
      else
        util.warning(string.format('Related entry "%s" of "%s" not found.', related_key, entry.key))
      end
    end
  end
end


return bibtex2csl