Coverage for src/crawler/by_source/lofpl_crawler.py: 84%

83 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import json

import regex
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class LofplCrawler(BaseCollectionCrawler):
    source_name = "Library of Science"
    source_domain = "LOFPL"
    source_website = "https://bibliotekanauki.pl"

    doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"

    def parse_collection_content(self, content):
        """
        Parse the JSON list of issues returned by the Library of Science API
        and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
            year = entry["year"]
            volume = entry["volume"]
            if entry["number"]:  # coverage: 37 ↛ 40, branch never taken (condition always true)
                number = entry["number"].replace("/", "-")
            else:
                number = None
            issue = self.create_xissue(link, year, volume, number)
            issues.append(issue)
        return issues
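    # A rough sketch of the issue entries parse_collection_content expects in
    # the collection JSON (field names taken from the code above, values
    # hypothetical):
    #     [{"id": 12345, "year": "2001", "volume": "34", "number": "1/2"}, ...]
    # Such an entry becomes an xissue whose url is
    # https://bibliotekanauki.pl/api/issues/12345/articles, with the issue
    # number normalised from "1/2" to "1-2".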

    def parse_issue_content(self, content: str, xissue: IssueData):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
            xissue.articles.append(xarticle)
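    # Each stub article produced by parse_issue_content gets a url pointing at
    # the JSON API, e.g. https://bibliotekanauki.pl/api/articles/2201019 (the
    # id reused from the workaround comment further down); the JSON served
    # there is what parse_article_content receives.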

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ):
        data = json.loads(content)

        # used only for prints
        pid = f"{xissue.pid}_{xarticle.pid}"
        # Titles
        xarticle.title_tex = data["mainTitle"]["text"]
        xarticle.lang = standardize_tag(data["language"])

        if data["mainTitleTranslations"]:
            # PID known to have translations: PMF
            print(f"[{pid}]title translations found")
            # raise NotImplementedError("title translations found")

        # Abstracts
        if data["mainAbstract"]["text"] != "":  # coverage: 70 ↛ 79, branch never taken (condition always true)
            xabstract = create_abstract(
                tag="abstract",
                value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
                lang=standardize_tag(data["mainAbstract"]["language"]),
            )

            xarticle.abstracts.append(xabstract)

        if data["abstractTranslations"]:  # coverage: 79 ↛ 81, branch never taken (condition never true)
            # PID known to have translations: PMF
            print(f"[{pid}]abstract translations found")

        if data["pageRange"]:
            self.set_pages(xarticle, data["pageRange"])

        # Keywords
        for keyword in data["keywords"]:
            xarticle.kwds.append(
                {
                    "type": "",
                    "lang": standardize_tag(keyword["language"]),
                    "value": keyword["text"],
                }
            )

        # Authors
        for a in data["contributors"]:
            if a["role"] != "AUTHOR":  # coverage: 98 ↛ 99, branch never taken (condition never true)
                raise NotImplementedError("Author role not implemented")
            author = create_contributor()
            author["role"] = "author"
            # Workaround for https://bibliotekanauki.pl/api/articles/2201019
            if a["firstName"] and a["lastName"]:  # coverage: 103 ↛ 107, branch never taken (condition always true)
                author["first_name"] = a["firstName"]
                author["last_name"] = a["lastName"]
            else:
                author["string_name"] = a["lastName"] or a["firstName"]

            author["corresponding"] = a["corresponding"]
            author["orcid"] = a["orcid"]
            xarticle.contributors.append(author)

        if len(data["bibEntries"]) > 0:
            bibitems = []

            for index, bib_entry in enumerate(data["bibEntries"]):
                bibitems.append(self.parse_bibitem(bib_entry, index + 1))

            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        add_pdf_link_to_xarticle(
            xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
        )
        return xarticle
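    # Fields read from the article JSON by parse_article_content above:
    # mainTitle, language, mainTitleTranslations, mainAbstract,
    # abstractTranslations, pageRange, keywords, contributors, bibEntries
    # and id (names taken directly from the accesses in the code; any other
    # part of the payload is ignored by this crawler).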

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # Fix: set correct article url
        article = super().crawl_article(xarticle, xissue)
        ext_link = next(link for link in article.ext_links if link["rel"] == "source")
        ext_link["location"] = ext_link["location"].replace("/api", "")
        return article
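    # crawl_article above rewrites the "source" ext_link so it points at the
    # public article page rather than the JSON API, e.g.
    #     https://bibliotekanauki.pl/api/articles/2201019
    # becomes
    #     https://bibliotekanauki.pl/articles/2201019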

    def parse_bibitem(self, text: str, index=None):
        doi_re = list(regex.finditer(self.doi_regex, text))
        if len(doi_re) == 0:  # coverage: 135 ↛ 137, branch never taken (condition always true)
            return JatsBase.bake_ref(text, str(index))
        text = regex.sub(self.doi_regex, "", text)
        text = text.removesuffix(", ")
        for doi_entry in doi_re:
            href = doi_entry.group(1)
            text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")

        return JatsBase.bake_ref(text, index)
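
For reference, here is a minimal, self-contained sketch of what the doi_regex handling in parse_bibitem does to a single reference string. Only the pattern and the string operations are taken from the crawler above; the citation text and the DOI are made-up placeholders.

import regex

doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"

# Hypothetical bibliography entry in the shape the crawler receives.
entry = "J. Kowalski, On an example, Fund. Math. 1 (2000), 1-10, DOI: https://doi.org/10.0000/example"

matches = list(regex.finditer(doi_regex, entry))
if matches:
    # Remove the "DOI: ..." fragment and the trailing ", " separator,
    # as parse_bibitem does before appending the ext-link XML.
    stripped = regex.sub(doi_regex, "", entry).removesuffix(", ")
    href = matches[0].group("doi_url")
    print(stripped)                               # citation text without the DOI
    print(href)                                   # https://doi.org/10.0000/example
    print(href.removeprefix("https://doi.org/"))  # 10.0000/example

In the crawler itself, the stripped citation text plus the extracted link are then handed to get_ext_link_xml and JatsBase.bake_ref to build the JATS reference.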