Coverage for src/crawler/by_source/lofpl_crawler.py: 85%
79 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import json

import regex
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class LofplCrawler(BaseCollectionCrawler):
    source_name = "Library of Science"
    source_domain = "LOFPL"
    source_website = "https://bibliotekanauki.pl"

    periode_begin = 0
    periode_end = 9999

    doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
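
    # Illustrative only: a bibliography string such as
    #   "J. Kowalski, Some result, Ann. Pol. Math. 12 (2020) 1-10, DOI: https://doi.org/10.0000/example"
    # would match with doi_url == "https://doi.org/10.0000/example" (the regex captures
    # everything after "DOI: " up to the next whitespace). The reference and DOI above
    # are invented for the example.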

    def parse_collection_content(self, content):
        """
        Parse the JSON list of issues returned by the Library of Science API and
        return a list of xissue.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
            year = entry["year"]
            volume = entry["volume"]
            number = entry["number"]
            issue = self.create_xissue(link, year, volume, number)
            issues.append(issue)
        return issues
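
    # Sketch of the collection payload parse_collection_content assumes (field names
    # come from the loop above; the values are invented for illustration):
    #   [
    #       {"id": 12345, "year": "2020", "volume": "53", "number": "1"},
    #       ...
    #   ]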

    def parse_issue_content(self, content: str, xissue: IssueData):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
            xissue.articles.append(xarticle)
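
    # The issue payload is assumed to be a JSON list of article stubs, e.g.
    # [{"id": 2201019, ...}, ...]; only the "id" field is used here.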

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        xarticle.pid = pid

        data = json.loads(content)

        # Titles
        xarticle.title_tex = data["mainTitle"]["text"]
        xarticle.lang = standardize_tag(data["language"])

        if data["mainTitleTranslations"]:
            # PID known to have translations: PMF
            print(f"[{pid}] title translations found")
            # raise NotImplementedError("title translations found")

        # Abstracts
        # coverage: condition was always true in the recorded run
        if data["mainAbstract"]["text"] != "":
            xabstract = create_abstract(
                tag="abstract",
                value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
                lang=standardize_tag(data["mainAbstract"]["language"]),
            )

            xarticle.abstracts.append(xabstract)

        # coverage: condition was never true in the recorded run
        if data["abstractTranslations"]:
            # PID known to have translations: PMF
            print(f"[{pid}] abstract translations found")

        if data["pageRange"]:
            self.set_pages(xarticle, data["pageRange"])

        # Keywords
        for keyword in data["keywords"]:
            xarticle.kwds.append(
                {
                    "type": "",
                    "lang": standardize_tag(keyword["language"]),
                    "value": keyword["text"],
                }
            )

        # Authors
        for a in data["contributors"]:
            # coverage: condition was never true in the recorded run
            if a["role"] != "AUTHOR":
                raise NotImplementedError("Author role not implemented")
            author = create_contributor()
            author["role"] = "author"
            # Workaround for https://bibliotekanauki.pl/api/articles/2201019
            # coverage: condition was always true in the recorded run
            if a["firstName"] and a["lastName"]:
                author["first_name"] = a["firstName"]
                author["last_name"] = a["lastName"]
            else:
                author["string_name"] = a["lastName"] or a["firstName"]

            author["corresponding"] = a["corresponding"]
            author["orcid"] = a["orcid"]
            xarticle.contributors.append(author)

        if len(data["bibEntries"]) > 0:
            xarticle.abstracts.append(
                self.create_bibliography(
                    [self.parse_bibitem(bib_entry) for bib_entry in data["bibEntries"]]
                )
            )

        add_pdf_link_to_xarticle(
            xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
        )
        return xarticle
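
    # Sketch of the article payload fields parse_article_content consumes (field names
    # come from the code above; the concrete values are invented for illustration and
    # are not the documented API shape):
    #   {
    #       "id": 2201019,
    #       "language": "en",
    #       "mainTitle": {"text": "..."},
    #       "mainTitleTranslations": [],
    #       "mainAbstract": {"text": "...", "language": "en"},
    #       "abstractTranslations": [],
    #       "pageRange": "1-10",
    #       "keywords": [{"text": "...", "language": "en"}],
    #       "contributors": [
    #           {"role": "AUTHOR", "firstName": "...", "lastName": "...",
    #            "corresponding": False, "orcid": None}
    #       ],
    #       "bibEntries": ["..."],
    #   }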

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # Fix: set correct article url
        article = super().crawl_article(xarticle, xissue)
        ext_link = next(link for link in article.ext_links if link["rel"] == "source")
        ext_link["location"] = ext_link["location"].replace("/api", "")
        return article
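
    # Example of the rewrite above: the crawled source link
    # "https://bibliotekanauki.pl/api/articles/2201019" becomes the public page
    # "https://bibliotekanauki.pl/articles/2201019" (article id reused from the
    # workaround comment in parse_article_content).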

    def parse_bibitem(self, text: str):
        doi_re = list(regex.finditer(self.doi_regex, text))
        # coverage: condition was always true in the recorded run (no DOI matched in any bib entry)
        if len(doi_re) == 0:
            return self.create_crawled_bibitem(text)
        text = regex.sub(self.doi_regex, "", text)
        text = text.removesuffix(", ")
        for doi_entry in doi_re:
            href = doi_entry.group(1)
            text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")

        return self.create_crawled_bibitem(text)
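
    # Behaviour sketch for parse_bibitem (the reference string is invented for
    # illustration):
    #   "A. Author, A result, Ann. Pol. Math. 1 (2000) 1-2, DOI: https://doi.org/10.0000/x"
    #   -> the "DOI: ..." fragment is removed from the plain text and re-emitted through
    #      get_ext_link_xml as an ext-link of type "doi", so the crawled bibitem keeps
    #      the citation text plus a structured DOI link.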