Coverage for src/crawler/by_source/dmlpl_crawler.py: 74%

155 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    periode_begin = 0
    periode_end = 9999

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    subCrawlers = {LofplCrawler: None}
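    # A test could, for instance, replace the sub-crawler with a stub
    # (hypothetical FakeLofplCrawler, using pytest's monkeypatch fixture):
    #   monkeypatch.setitem(DmlplCrawler.subCrawlers, LofplCrawler, FakeLofplCrawler())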

    def parse_collection_content(self, content):
        """
        Parse the collection JSON returned by the DMLPL website and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
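        # Each entry is expected to look roughly like this (schematic placeholders,
        # inferred from the parsing below):
        #   {"id": "<element id>", "text": "tom/rocznik ... <a><volume></a> <a><year></a>"}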

        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):  # coverage: condition never true
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:  # coverage: condition never true
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:  # coverage: condition never true
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):  # coverage: condition always true
                has_articles = True

        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues

    def parse_issue_content(self, content, xissue):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA : manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
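    # One way to do that (untested sketch; assumes a requests-compatible client):
    #   resp = requests.head(url, allow_redirects=False)
    #   redirect_url = resp.headers.get("Location")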

    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            response = self.get(xarticle.url)

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:  # coverage: condition never true
                    raise ValueError("Crawler incorrectly initialized")
                parsed_xarticle = targetCrawler.crawl_article(xarticle, xissue)
            elif response.url.startswith("http://pldml.icm.edu.pl"):  # coverage: condition always true
                parsed_xarticle = super().crawl_article(xarticle, xissue)
            else:
                raise NotImplementedError

        if not parsed_xarticle:  # coverage: condition never true
            raise ValueError("Couldn't crawl article")
        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def parse_dmlpl_generic_page(self, content: str):
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:  # coverage: condition never true
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:  # coverage: condition never true
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:  # coverage: condition always true
                sections_dict[row_label.text] = tag
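        # The resulting dict maps a row label to its description tag, roughly
        # (illustrative keys taken from parse_article_content below):
        #   {"Tytuł artykułu": <div class="row-desc">, "Autorzy": <div class="row-desc">, ...}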

        return sections_dict

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        sections_dict = self.parse_dmlpl_generic_page(content)

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
        xarticle.pid = pid

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:  # coverage: condition always true
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:  # coverage: condition never true
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: condition never true
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"[{pid}] PDF not found")

        # Lang
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":  # coverage: condition always true
                xarticle.lang = "pl"
                print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:  # coverage: condition always true
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:  # coverage: loop never ran
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                xabstract = create_abstract(
                    tag="abstract", value_tex=cleanup_str(text_div.text), lang=lang
                )
                xarticle.abstracts.append(xabstract)

        # Keywords
        if "Słowa kluczowe" in sections_dict:  # coverage: condition always true
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keywords_list in keywords_lists:  # coverage: loop never ran
                lang = "und"
                lang_div = keywords_list.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keywords_list.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:  # coverage: condition never true
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author