"""Convert a yaml file to bib file with the correct journal abbreviations."""
from __future__ import annotations
import contextlib
import glob
import os
from typing import Any
import click
import crossref.restful
import diskcache
import requests
import yaml
from pylatexenc.latexencode import unicode_to_latex
from tqdm import tqdm
def pages_from_crossref(data: dict[str, Any]) -> str:
    """Return the page identifier stored in a Crossref record.

    Parameters
    ----------
    data
        Mapping that is looked up under the ``"article-number"`` and
        ``"page"`` keys.

    Returns
    -------
    str
        The ``"article-number"`` value when that key exists; otherwise the
        part of the ``"page"`` value that precedes the first ``-``.

    Raises
    ------
    KeyError
        If neither ``"article-number"`` nor ``"page"`` is present.
    """
    # The article number takes precedence over a page range.
    if "article-number" in data:
        return data["article-number"]
    if "page" not in data:
        msg = "No page number found!"
        raise KeyError(msg)
    # Keep only the first page of a range such as "123-130".
    first_page, _, _ = data["page"].partition("-")
    return first_page
def journal_from_crossref(data: dict[str, Any]) -> tuple[str, str]:
    """Return the ``(full title, short title)`` pair of a Crossref record.

    The first element of ``data["container-title"]`` and the first element
    of ``data["short-container-title"]`` are returned, in that order.  A
    missing key raises ``KeyError`` and an empty list raises ``IndexError``.
    """
    full_title = data["container-title"][0]
    abbreviation = data["short-container-title"][0]
    return full_title, abbreviation
def cached_crossref(
    doi: str,
    works: crossref.restful.Works,
    database: str,
) -> dict[str, Any]:
    """Return the Crossref record for ``doi``, consulting a disk cache first.

    Parameters
    ----------
    doi
        Key used both for the cache lookup and for ``works.doi``.
    works
        Object whose ``doi`` method is called on a cache miss.
    database
        Location passed to ``diskcache.Cache``.

    Returns
    -------
    dict
        The cached value if one exists, otherwise the freshly fetched value
        (which is stored in the cache before being returned).
    """
    with diskcache.Cache(database) as store:
        record = store.get(doi)
        if record is None:
            # Cache miss: query Crossref and remember the answer.
            record = works.doi(doi)
            store[doi] = record
        return record
def replace_key(
    key: str,
    data: dict[str, Any],
    bib_entry: str,
    replacements: list[tuple[str, str]],
) -> str:
    """Rewrite a BibTeX entry with a new key, replacements and page number.

    Parameters
    ----------
    key
        Citation key that replaces whatever sits between the first ``{`` and
        the first ``,`` of ``bib_entry``.
    data
        Record handed to ``journal_from_crossref`` and
        ``pages_from_crossref``.
    bib_entry
        The BibTeX text to rewrite; it must contain at least one ``,``.
    replacements
        ``(old, new)`` pairs applied with ``str.replace`` in list order.  The
        list itself is not modified.

    Returns
    -------
    str
        The rewritten entry.
    """
    entry_type = bib_entry.split("{")[0]
    # Only the text after the first comma is edited, so the key is never
    # touched by the substitutions below.
    remainder = bib_entry.split(",", maxsplit=1)[1]

    # Non-ASCII characters become their LaTeX equivalents.
    remainder = unicode_to_latex(remainder, non_ascii_only=True)

    substitutions = replacements.copy()
    with contextlib.suppress(Exception):
        # The Crossref abbreviation goes last, so hard-coded replacements
        # listed earlier are applied before it.
        substitutions.append(journal_from_crossref(data))

    for old, new in substitutions:
        remainder = remainder.replace(old, new)

    result = "".join((entry_type, "{", key, ",", remainder))

    if "pages = {" in result:
        return result

    # No pages field yet: best-effort insertion as the third line.
    with contextlib.suppress(Exception):
        pages = pages_from_crossref(data)
        lines = result.split("\n")
        lines.insert(2, f"\tpages = {{{pages}}},")
        result = "\n".join(lines)
    return result
def doi2bib(doi: str) -> str:
    """Fetch the BibTeX text for ``doi`` over HTTP.

    A progress line is printed, then a GET request with a 60 second timeout
    and an ``accept: application/x-bibtex`` header is sent.  The response body
    is decoded as UTF-8 and returned without inspecting the status code.
    """
    print(f"Requesting {doi}")
    response = requests.get(
        "http://dx.doi.org/" + doi,
        headers={"accept": "application/x-bibtex"},
        timeout=60,
    )
    # Force UTF-8 rather than relying on the encoding guessed by requests.
    response.encoding = "utf-8"
    return response.text
def cached_doi2bib(doi: str, database: str) -> str:
    """Return the BibTeX text for ``doi``, consulting a disk cache first.

    On a cache miss ``doi2bib`` is called.  A non-empty answer that does not
    contain ``<html>`` is stored in the cache; anything else is reported as a
    failure and returned uncached.

    Parameters
    ----------
    doi
        Key used for the cache lookup and passed to ``doi2bib``.
    database
        Location passed to ``diskcache.Cache``.

    Returns
    -------
    str
        The cached or freshly fetched text (returned even on failure).
    """
    with diskcache.Cache(database) as store:
        cached = store.get(doi)
        if cached is not None:
            return cached

        fetched = doi2bib(doi)
        # An empty body or an HTML page means the lookup did not yield BibTeX.
        if not fetched or "<html>" in fetched:
            print(f"Failed on '{doi}' 😢")
            return fetched

        print(f"Succesfully got '{doi}' 🎉")
        store[doi] = fetched
        return fetched
def combine_yamls(pathname: str) -> dict[str, str]:
    """Merge all YAML files matching ``pathname`` into one sorted mapping.

    Parameters
    ----------
    pathname
        Glob pattern passed to ``glob.glob``.

    Returns
    -------
    dict
        All ``key: value`` pairs from the matching files, sorted by key.
        Empty files contribute nothing.

    Raises
    ------
    KeyError
        If the same key appears with values that differ other than by case.
    """
    mapping: dict[str, str] = {}
    for fname in glob.glob(pathname):
        with open(fname, encoding="utf-8") as f:
            # ``safe_load`` returns ``None`` for an empty document; treat that
            # as "no entries" instead of failing on ``None.items()``.
            loaded = yaml.safe_load(f) or {}
        for k, v in loaded.items():
            if k not in mapping:
                mapping[k] = v
            elif v.lower() != mapping[k].lower():
                # Duplicate keys are only tolerated when they name the same
                # DOI (compared case-insensitively).
                msg = f"{k} exists for multiple DOIs: {v} and {mapping[k]}."
                raise KeyError(msg)
    return dict(sorted(mapping.items()))
def parse_doi_yaml(fname: str) -> dict[str, str]:
    """Load the ``key: doi`` mapping from a file or from a glob pattern.

    Parameters
    ----------
    fname
        Path of an existing YAML file, or otherwise a pattern that is handed
        to ``combine_yamls``.

    Returns
    -------
    dict
        The parsed mapping; an empty file yields an empty dict.
    """
    if not os.path.isfile(fname):
        return combine_yamls(fname)
    with open(fname, encoding="utf-8") as f:
        # ``safe_load`` returns ``None`` for an empty document, which would
        # break callers that iterate over the result.
        return yaml.safe_load(f) or {}
def parse_replacements_yaml(fname: str | None) -> list[tuple[str, str]]:
    """Flatten a YAML file of grouped replacements into ``(old, new)`` pairs.

    The file is expected to hold a mapping whose values are themselves
    ``old: new`` mappings; the top-level keys are ignored.

    Parameters
    ----------
    fname
        Path of the YAML file, or ``None`` to request no replacements.

    Returns
    -------
    list of tuple
        Every ``(old, new)`` pair in file order.  ``None``, an empty file and
        empty sections all contribute nothing.
    """
    if fname is None:
        return []
    with open(fname, encoding="utf-8") as f:
        # An empty document loads as ``None``.
        d = yaml.safe_load(f) or {}
    return [
        (old, new)
        for section in d.values()
        # A section header without entries also loads as ``None``.
        for old, new in (section or {}).items()
    ]
def write_output(entries: list[str], bib_files: list[str], bib_fname: str) -> None:
    """Write the final ``.bib`` file.

    The output consists of a fixed ``@preamble`` line, the verbatim contents
    of every file in ``bib_files`` (each introduced by a comment naming the
    file), and finally every entry in ``entries`` followed by a blank line.

    Parameters
    ----------
    entries
        BibTeX entries as strings; lines containing ``url = {`` are dropped.
    bib_files
        Paths of files copied verbatim into the output.
    bib_fname
        Path of the file to create or overwrite.
    """
    with open(bib_fname, "w") as out:
        out.write("@preamble{ {\\providecommand{\\BIBYu}{Yu} } }\n\n")

        for path in bib_files:
            out.write(f"\n% Below is from `{path}`.\n\n")
            with open(path) as static:
                out.write(static.read())

        out.write("\n% Below is from all `yaml` files.\n\n")
        for entry in entries:
            # The url line is dropped because LaTeX creates it from the DOI.
            kept = [text for text in entry.split("\n") if "url = {" not in text]
            out.writelines(f"{text}\n" for text in kept)
            out.write("\n")
def static_bib_entries(pathname: str | None) -> list[str]:
    """Resolve ``pathname`` to a list of static ``.bib`` file paths.

    Parameters
    ----------
    pathname
        ``None``, the path of an existing file, or a glob pattern.

    Returns
    -------
    list of str
        ``[]`` for ``None``, ``[pathname]`` when it names an existing file,
        and otherwise whatever ``glob.glob(pathname)`` matches.
    """
    if pathname is None:
        return []
    return [pathname] if os.path.isfile(pathname) else glob.glob(pathname)
def get_bib_entries(
    dois: dict[str, str],
    replacements: list[tuple[str, str]],
    doi2bib_database: str,
    crossref_database: str,
    works: crossref.restful.Works,
) -> list[str]:
    """Build one rewritten BibTeX entry per ``key: doi`` pair.

    Parameters
    ----------
    dois
        Mapping of citation key to DOI; iterated in order under a ``tqdm``
        progress bar.
    replacements
        ``(old, new)`` pairs forwarded to ``replace_key``.
    doi2bib_database
        Cache location forwarded to ``cached_doi2bib``.
    crossref_database
        Cache location forwarded to ``cached_crossref``.
    works
        Crossref client forwarded to ``cached_crossref``.

    Returns
    -------
    list of str
        The entries, in the iteration order of ``dois``.
    """
    entries: list[str] = []
    for key, doi in tqdm(dois.items()):
        # Crossref metadata is looked up before the BibTeX text.
        crossref_data = cached_crossref(doi, works, crossref_database)
        raw_bibtex = cached_doi2bib(doi, doi2bib_database)
        entries.append(
            replace_key(
                key,
                data=crossref_data,
                bib_entry=raw_bibtex,
                replacements=replacements,
            ),
        )
    return entries
# Programmatic entry point; the `cli` command forwards its options to it.
def yaml2bib(
    bib_fname: str,
    dois_yaml: str,
    *,
    replacements_yaml: str | None,
    static_bib: str | None,
    doi2bib_database: str,
    crossref_database: str,
    email: str,
) -> None:
    r"""Convert a yaml file to bib file with the correct journal abbreviations.

    The defaults mentioned below are those of the command line interface;
    this function itself requires every argument.

    Parameters
    ----------
    bib_fname: str
        Output file. (CLI default: ``'dissertation.bib'``)
    dois_yaml: str
        The ``key: doi`` YAML file, may contain wildcards (``*``).
        (CLI default: ``'bib.yaml'``, example: ``'*/*.yaml'``)
    replacements_yaml: str
        Replacements to perform, might be ``None``.
        (CLI default: ``None``, example: ``'replacements.yaml'``)
    static_bib: str
        Static bib entries, might be ``None``, may contain wildcards (``*``).
        (CLI default: ``None``, example: ``'chapter_*/not_on_crossref.bib'``)
    doi2bib_database: str
        The doi2bib database folder 📁 to not query doi.org more than needed.
        (CLI default: ``'yaml2bib-doi2bib.db'``)
    crossref_database: str
        The Crossref database folder 📁 to not query crossref.org more than
        needed. (CLI default: ``'yaml2bib-crossref.db'``)
    email: str
        E-mail 📧 for crossref.org, such that one can make more API calls
        without getting blocked.
        (CLI default: ``'anonymous'``, example: ``'bas@nijho.lt'``)

    Returns
    -------
    None

    Examples
    --------
    Example invocation for my `thesis <https://gitlab.kwant-project.org/qt/basnijholt/thesis-bas-nijholt>`_.

    .. code-block:: bash

        yaml2bib \
          --bib_fname "dissertation.bib" \
          --dois_yaml "*/*.yaml" \
          --replacements_yaml "replacements.yaml" \
          --static_bib "chapter_*/not_on_crossref.bib" \
          --email "bas@nijho.lt"
    """
    # The contact e-mail is attached to the Crossref client via its etiquette.
    works = crossref.restful.Works(
        etiquette=crossref.restful.Etiquette("publist", contact_email=email),
    )
    entries = get_bib_entries(
        parse_doi_yaml(dois_yaml),
        parse_replacements_yaml(replacements_yaml),
        doi2bib_database,
        crossref_database,
        works,
    )
    write_output(entries, static_bib_entries(static_bib), bib_fname)
# Command line front end: every option maps one-to-one onto a `yaml2bib`
# argument.  No docstring is used on `cli` because click would print it as
# extra text in the `--help` output.
@click.command()
@click.option(
    "--bib_fname",
    default="dissertation.bib",
    help="Output file. (default: 'dissertation.bib')",
)
@click.option(
    "--dois_yaml",
    default="bib.yaml",
    help=(
        "The `key: doi` YAML file, may contain wildcards (*). "
        "(default: 'bib.yaml', example: '*/*.yaml')"
    ),
)
@click.option(
    "--replacements_yaml",
    default=None,
    help=(
        "Replacements to perform, might be None. "
        "(default: None, example: 'replacements.yaml')"
    ),
)
@click.option(
    "--static_bib",
    default=None,
    help=(
        "Static bib entries, might be None, may contain wildcards (*). "
        "(default: None, example: 'chapter_*/not_on_crossref.bib')"
    ),
)
@click.option(
    "--doi2bib_database",
    default="yaml2bib-doi2bib.db",
    help=(
        "The doi2bib database folder 📁 to not query doi.org more than needed. "
        "(default: 'yaml2bib-doi2bib.db')"
    ),
)
@click.option(
    "--crossref_database",
    default="yaml2bib-crossref.db",
    help=(
        "The Crossref database folder 📁 to not query crossref.org more than needed. "
        # The advertised default must match the `default=` value above.
        "(default: 'yaml2bib-crossref.db')"
    ),
)
@click.option(
    "--email",
    default="anonymous",
    help=(
        "E-mail 📧 for crossref.org, such that one can make more API calls "
        "without getting blocked. (default: 'anonymous', example: 'bas@nijho.lt')"
    ),
)
def cli(
    bib_fname: str,
    dois_yaml: str,
    # These two options default to None, so the annotations must allow it.
    replacements_yaml: str | None,
    static_bib: str | None,
    doi2bib_database: str,
    crossref_database: str,
    email: str,
) -> None:
    click.echo(
        "Convert a yaml file to bib file with the correct journal abbreviations.",
    )
    yaml2bib(
        bib_fname=bib_fname,
        dois_yaml=dois_yaml,
        replacements_yaml=replacements_yaml,
        static_bib=static_bib,
        doi2bib_database=doi2bib_database,
        crossref_database=crossref_database,
        email=email,
    )
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    cli()