Coverage for src/crawler/by_source/lofpl_crawler.py: 81%

82 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

  1  import json
  2
  3  import regex
  4  from langcodes import standardize_tag
  5  from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
  6  from ptf.cmds.xml.jats.jats_parser import JatsBase
  7  from ptf.model_data import (
  8      ArticleData,
  9      IssueData,
 10      create_abstract,
 11      create_articledata,
 12      create_contributor,
 13  )
 14
 15  from crawler.base_crawler import BaseCollectionCrawler
 16  from crawler.crawler_utils import set_pages
 17  from crawler.utils import add_pdf_link_to_xarticle
 18
 19
 20  class LofplCrawler(BaseCollectionCrawler):
 21      source_name = "Library of Science"
 22      source_domain = "LOFPL"
 23      source_website = "https://bibliotekanauki.pl"
 24
 25      doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
 26
 27      def parse_collection_content(self, content):
 28          """
 29          Parse the JSON list of issues of a Library of Science collection and return a list of xissues.
 30          Each xissue has its pid/volume/number/year metadata + its url.
 31          """
 32          issues = []
 33          data = json.loads(content)
 34          for entry in data:
 35              link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
 36              year = entry["year"]
 37              volume = entry["volume"]
 38              if entry["number"]:  # 38 ↛ 41: line 38 didn't jump to line 41 because the condition on line 38 was always true
 39                  number = entry["number"].replace("/", "-")
 40              else:
 41                  number = None
 42              issue = self.create_xissue(link, year, volume, number)
 43              issues.append(issue)
 44          return issues
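     # Illustrative (not part of the source): each entry in the issues payload is
     # expected to carry at least "id", "year", "volume" and "number", e.g. a
     # hypothetical entry {"id": 123, "year": "2021", "volume": "58", "number": "1/2"};
     # a number such as "1/2" is normalized to "1-2", and an empty number yields None.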

 45
 46      def parse_issue_content(self, content: str, xissue: IssueData):
 47          data = json.loads(content)
 48          for index, entry in enumerate(data):
 49              xarticle = create_articledata()
 50              xarticle.pid = "a" + str(index)
 51              xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
 52              xissue.articles.append(xarticle)
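     # Illustrative (not part of the source): each stub gets a pid "a0", "a1", ... and
     # an API url such as https://bibliotekanauki.pl/api/articles/2201019 (article id
     # 2201019 is the one cited in the workaround comment below); the base crawler is
     # expected to fetch that url and hand the response to parse_article_content.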

 53
 54      def parse_article_content(
 55          self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
 56      ):
 57          data = json.loads(content)
 58
 59          # used only for prints
 60          pid = f"{xissue.pid}_{xarticle.pid}"
 61          # Titles
 62          xarticle.title_tex = data["mainTitle"]["text"]
 63          xarticle.lang = standardize_tag(data["language"])
 64
 65          if data["mainTitleTranslations"]:  # 65 ↛ 67: line 65 didn't jump to line 67 because the condition on line 65 was never true
 66              # PID known to have translations : PMF
 67              self.logger.debug("Title translations found", extra={"pid": pid})
 68              # raise NotImplementedError("title translations found")
 69
 70          # Abstracts
 71          if data["mainAbstract"]["text"] != "":  # 71 ↛ 79: line 71 didn't jump to line 79 because the condition on line 71 was always true
 72              xabstract = create_abstract(
 73                  value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
 74                  lang=standardize_tag(data["mainAbstract"]["language"]),
 75              )
 76
 77              xarticle.abstracts.append(xabstract)
 78
 79          if data["abstractTranslations"]:  # 79 ↛ 81: line 79 didn't jump to line 81 because the condition on line 79 was never true
 80              # PID known to have translations : PMF
 81              self.logger.debug("Abstract translations found", extra={"pid": pid})
 82
 83          if data["pageRange"]:
 84              set_pages(xarticle, data["pageRange"])
 85
 86          # Keywords
 87          for keyword in data["keywords"]:
 88              xarticle.kwds.append(
 89                  {
 90                      "type": "",
 91                      "lang": standardize_tag(keyword["language"]),
 92                      "value": keyword["text"],
 93                  }
 94              )
 95
 96          # Authors
 97          for a in data["contributors"]:
 98              if a["role"] != "AUTHOR":  # 98 ↛ 99: line 98 didn't jump to line 99 because the condition on line 98 was never true
 99                  raise NotImplementedError("Author role not implemented")
100              author = create_contributor()
101              author["role"] = "author"
102              # Workaround for https://bibliotekanauki.pl/api/articles/2201019
103              if a["firstName"] and a["lastName"]:  # 103 ↛ 107: line 103 didn't jump to line 107 because the condition on line 103 was always true
104                  author["first_name"] = a["firstName"]
105                  author["last_name"] = a["lastName"]
106              else:
107                  author["string_name"] = a["lastName"] or a["firstName"]
108
109              author["corresponding"] = a["corresponding"]
110              author["orcid"] = a["orcid"]
111              xarticle.contributors.append(author)
112
113          if len(data["bibEntries"]) > 0:  # 113 ↛ 117: line 113 didn't jump to line 117 because the condition on line 113 was always true
114              for index, bib_entry in enumerate(data["bibEntries"]):
115                  xarticle.bibitems.append(self.parse_bibitem(bib_entry, index + 1))
116
117          add_pdf_link_to_xarticle(
118              xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
119          )
120          return xarticle
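     # Illustrative (not part of the source): for the same article id the PDF link added
     # above would be https://bibliotekanauki.pl/articles/2201019.pdf.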

121
122      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
123          # Fix: set correct article url
124          article = super().crawl_article(xarticle, xissue)
125          ext_link = next(link for link in article.ext_links if link["rel"] == "source")
126          ext_link["location"] = ext_link["location"].replace("/api", "")
127          return article
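     # Illustrative (not part of the source): the "source" ext_link set by the base
     # crawler, e.g. https://bibliotekanauki.pl/api/articles/2201019, is rewritten to
     # the public page https://bibliotekanauki.pl/articles/2201019.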

128
129      def parse_bibitem(self, text: str, index=None):
130          doi_re = list(regex.finditer(self.doi_regex, text))
131          if len(doi_re) == 0:  # 131 ↛ 133: line 131 didn't jump to line 133 because the condition on line 131 was always true
132              return JatsBase.bake_ref(text, str(index))
133          text = regex.sub(self.doi_regex, "", text)
134          text = text.removesuffix(", ")
135          for doi_entry in doi_re:
136              href = doi_entry.group(1)
137              text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
138
139          return JatsBase.bake_ref(text, index)
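
A minimal sketch of how the doi_regex used by parse_bibitem splits a reference string. The reference text and DOI below are invented; only the pattern and the string handling mirror the method above, which then wraps the remaining text and the extracted link with get_ext_link_xml and JatsBase.bake_ref.

import regex

doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
ref = "J. Doe, Some theorem, Sample J. Math. 1 (2020), DOI: https://doi.org/10.0000/example"

match = regex.search(doi_regex, ref)
if match:
    href = match.group("doi_url")
    label = href.removeprefix("https://doi.org/")
    cleaned = regex.sub(doi_regex, "", ref).removesuffix(", ")
    print(href)     # https://doi.org/10.0000/example
    print(label)    # 10.0000/example  (used as the visible link text)
    print(cleaned)  # J. Doe, Some theorem, Sample J. Math. 1 (2020)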