Coverage for src/crawler/by_source/dmlpl_crawler.py: 76%

164 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    periode_begin = 0
    periode_end = 9999

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    subCrawlers = {LofplCrawler: None}
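    # The value is expected to be filled in with an instance before crawling;
    # crawl_article raises "Crawler incorrectly initialized" if it is still None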

    def parse_collection_content(self, content):
        """
        Parse the collection JSON and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
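            # "tom/rocznik" is Polish for roughly "volume/yearbook"; the entry text is an
            # HTML fragment whose first <a> holds the volume and the second the year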
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
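            # A volume node lists either numbered issues ("numer") or articles ("artykuł") directly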
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):
                has_articles = True

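        # Volumes that list articles directly (without "numer" children) become a single xissue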
        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues

    def parse_issue_content(self, content, xissue):
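        # Each JSON entry is one article of the issue; pids are assigned sequentially (a0, a1, ...)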
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA : manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            url = xarticle.url

            article_source = self.source_domain
            response = self.get(xarticle.url)
            content = self.decode_response(response)
            pid = f"{xissue.pid}_{xarticle.pid}"

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
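                # The rewritten /api URL serves the metadata that LofplCrawler.parse_article_content consumes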
                content = self.download_file(xarticle.url)
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:
                    raise ValueError("Crawler incorrectly initialized")
                targetCrawler.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid  # type: ignore
                )
                article_source = targetCrawler.source_domain
            elif response.url.startswith("http://pldml.icm.edu.pl"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid
                )
            else:
                raise NotImplementedError

            # ARTICLE URL as an ExtLink (to display the link in the article page)
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = url
            ext_link["metadata"] = article_source
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def parse_dmlpl_generic_page(self, content: str):
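        # DMLPL detail pages are label/value rows; map each (Polish) row label to the Tag holding its value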
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:
                sections_dict[row_label.text] = tag

        return sections_dict

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        sections_dict = self.parse_dmlpl_generic_page(content)
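        # Row labels are Polish: "Tytuł artykułu" = article title, "Autorzy" = authors,
        # "Treść / Zawartość" = content, "Języki publikacji" = publication languages,
        # "Abstrakty" = abstracts, "Słowa kluczowe" = keywords, "Strony" = pages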

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
        xarticle.pid = pid

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"[{pid}] PDF not found")

        # Lang
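        # "Języki publikacji" may list several codes; only the known "pl fr" case is patched below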
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":
                xarticle.lang = "pl"
                print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                xabstract = create_abstract(
                    tag="abstract", value_tex=cleanup_str(text_div.text), lang=lang
                )
                xarticle.abstracts.append(xabstract)

        # Keywords
        if "Słowa kluczowe" in sections_dict:
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keyword_row in keywords_lists:
                lang = "und"
                lang_div = keyword_row.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keyword_row.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
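        # "Nazwisko" = surname, "Imię" = first name; fall back to the full "Twórca" (creator) string if either is empty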
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author