Coverage for src/crawler/by_source/lofpl_crawler.py: 84%

82 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1  import json
  2
  3  import regex
  4  from langcodes import standardize_tag
  5  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
  6  from ptf.model_data import (
  7      ArticleData,
  8      IssueData,
  9      create_abstract,
 10      create_articledata,
 11      create_contributor,
 12  )
 13
 14  from crawler.base_crawler import BaseCollectionCrawler
 15  from crawler.utils import add_pdf_link_to_xarticle
 16
 17
 18  class LofplCrawler(BaseCollectionCrawler):
 19      source_name = "Library of Science"
 20      source_domain = "LOFPL"
 21      source_website = "https://bibliotekanauki.pl"
 22
 23      doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
 24
 25      def parse_collection_content(self, content):
 26          """
 27          Parse the JSON list of issues of the Library of Science collection and return a list of xissues.
 28          Each xissue has its pid/volume/number/year metadata + its url
 29          """
 30          issues = []
 31          data = json.loads(content)
 32          for entry in data:
 33              link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
 34              year = entry["year"]
 35              volume = entry["volume"]
 36              if entry["number"]:  # branch 36 ↛ 39 never taken (condition on line 36 was always true)
 37                  number = entry["number"].replace("/", "-")
 38              else:
 39                  number = None
 40              issue = self.create_xissue(link, year, volume, number)
 41              issues.append(issue)
 42          return issues
 43
 44      def parse_issue_content(self, content: str, xissue: IssueData):
 45          data = json.loads(content)
 46          for index, entry in enumerate(data):
 47              xarticle = create_articledata()
 48              xarticle.pid = "a" + str(index)
 49              xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
 50              xissue.articles.append(xarticle)
 51
 52      def parse_article_content(
 53          self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
 54      ):
 55          data = json.loads(content)
 56
 57          # used only for prints
 58          pid = f"{xissue.pid}_{xarticle.pid}"
 59          # Titles
 60          xarticle.title_tex = data["mainTitle"]["text"]
 61          xarticle.lang = standardize_tag(data["language"])
 62
 63          if data["mainTitleTranslations"]:
 64              # PID known to have translations : PMF
 65              print(f"[{pid}]title translations found")
 66              # raise NotImplementedError("title translations found")
 67
 68          # Abstracts
 69          if data["mainAbstract"]["text"] != "":  # branch 69 ↛ 78 never taken (condition on line 69 was always true)
 70              xabstract = create_abstract(
 71                  tag="abstract",
 72                  value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
 73                  lang=standardize_tag(data["mainAbstract"]["language"]),
 74              )
 75
 76              xarticle.abstracts.append(xabstract)
 77
 78          if data["abstractTranslations"]:  # branch 78 ↛ 80 never taken (condition on line 78 was never true)
 79              # PID known to have translations : PMF
 80              print(f"[{pid}]abstract translations found")
 81
 82          if data["pageRange"]:
 83              self.set_pages(xarticle, data["pageRange"])
 84
 85          # Keywords
 86          for keyword in data["keywords"]:
 87              xarticle.kwds.append(
 88                  {
 89                      "type": "",
 90                      "lang": standardize_tag(keyword["language"]),
 91                      "value": keyword["text"],
 92                  }
 93              )
 94
 95          # Authors
 96          for a in data["contributors"]:
 97              if a["role"] != "AUTHOR":  # branch 97 ↛ 98 never taken (condition on line 97 was never true)
 98                  raise NotImplementedError("Author role not implemented")
 99              author = create_contributor()
100              author["role"] = "author"
101              # Workaround for https://bibliotekanauki.pl/api/articles/2201019
102              if a["firstName"] and a["lastName"]:  # branch 102 ↛ 106 never taken (condition on line 102 was always true)
103                  author["first_name"] = a["firstName"]
104                  author["last_name"] = a["lastName"]
105              else:
106                  author["string_name"] = a["lastName"] or a["firstName"]
107
108              author["corresponding"] = a["corresponding"]
109              author["orcid"] = a["orcid"]
110              xarticle.contributors.append(author)
111
112          if len(data["bibEntries"]) > 0:
113              bibitems = []
114
115              for index, bib_entry in enumerate(data["bibEntries"]):
116                  bibitems.append(self.parse_bibitem(bib_entry, index+1))
117
118              xarticle.abstracts.append(
119                  self.create_bibliography(
120                      bibitems
121                  )
122              )
123
124          add_pdf_link_to_xarticle(
125              xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
126          )
127          return xarticle
128
129      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
130          # Fix: set correct article url
131          article = super().crawl_article(xarticle, xissue)
132          ext_link = next(link for link in article.ext_links if link["rel"] == "source")
133          ext_link["location"] = ext_link["location"].replace("/api", "")
134          return article
135
136      def parse_bibitem(self, text: str, index = None):
137          doi_re = list(regex.finditer(self.doi_regex, text))
138          if len(doi_re) == 0:  # branch 138 ↛ 140 never taken (condition on line 138 was always true)
139              return self.create_crawled_bibitem(text, index)
140          text = regex.sub(self.doi_regex, "", text)
141          text = text.removesuffix(", ")
142          for doi_entry in doi_re:
143              href = doi_entry.group(1)
144              text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
145
146          return self.create_crawled_bibitem(text, index)
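
For context, a minimal, self-contained sketch (not part of the covered module) of the issue-list JSON that parse_collection_content appears to expect, with the same link and issue-number derivation applied outside the class. The field names ("id", "year", "volume", "number") are inferred from the code above; the sample values are hypothetical and not taken from the live bibliotekanauki.pl API.

    import json

    # Hypothetical sample mimicking the /api/issues response shape inferred above
    sample = json.dumps([
        {"id": 111111, "year": "2024", "volume": "32", "number": "1/2"},
        {"id": 222222, "year": "2024", "volume": "32", "number": None},
    ])

    for entry in json.loads(sample):
        # Same URL construction as line 33 of the crawler
        link = "https://bibliotekanauki.pl/api/issues/" + str(entry["id"]) + "/articles"
        # Same normalisation as lines 36-39: "1/2" becomes "1-2", a missing number stays None
        number = entry["number"].replace("/", "-") if entry["number"] else None
        print(link, entry["year"], entry["volume"], number)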