Coverage for src/crawler/by_source/dmlpl_crawler.py: 73%

154 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
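For context, a report like this can be regenerated with coverage.py's Python API. A minimal sketch only, assuming the project's tests run under pytest; the measured source path and test directory below are assumptions, not taken from this report:

    # Sketch only: regenerate an HTML coverage report with coverage.py's Python API.
    # Assumptions: tests run under pytest, "src/crawler" is the measured tree, "tests" is the test dir.
    import coverage
    import pytest

    cov = coverage.Coverage(source=["src/crawler"])
    cov.start()
    pytest.main(["tests"])   # hypothetical test directory
    cov.stop()
    cov.save()
    cov.html_report()        # writes the htmlcov/ pages, including pages like this one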

  1  import json
  2  from urllib import parse
  3
  4  from bs4 import BeautifulSoup, Tag
  5  from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj
  6
  7  from crawler.base_crawler import BaseCollectionCrawler
  8  from crawler.by_source.lofpl_crawler import LofplCrawler
  9  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 10
 11

 12  class DmlplCrawler(BaseCollectionCrawler):
 13      source_name = "The Polish Digital Mathematics Library"
 14      source_domain = "DMLPL"
 15      source_website = "http://pldml.icm.edu.pl/pldml"
 16
 17      # HACK : Workaround for tests (monkeypatching)
 18      # We store the class here, so we can monkeypatch it when running tests
 19      subCrawlers = {LofplCrawler: None}
 20

 21      def parse_collection_content(self, content):
 22          """
 23          Parse the collection's JSON content and return a list of xissues.
 24          Each xissue has its pid/volume/number/year metadata + its url
 25          """
 26          issues = []
 27          data = json.loads(content)
 28          for entry in data:
 29              link = self.source_website + "/tree/hierarchy.action"
 30              params = {"root": entry["id"]}
 31              link += "?" + parse.urlencode(params)
 32
 33              text: str = entry["text"]
 34              if not text.startswith("tom/rocznik"):  [34 ↛ 35: condition on line 34 was never true]
 35                  raise ValueError(
 36                      'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
 37                  )
 38              soup = BeautifulSoup(text, "html.parser")
 39              a_tags = soup.select("a")
 40              if len(a_tags) < 2:  [40 ↛ 41: condition on line 40 was never true]
 41                  raise ValueError("Cannot parse Collection : couldn't find volume information")
 42              volume = a_tags[0].text
 43              year = a_tags[1].text
 44
 45              issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
 46          return issues
 47

 48      def parse_dmlpl_volume_content(self, link, year, volume):
 49          content = self.download_file(link)
 50          has_articles = False
 51          issues = []
 52          data = json.loads(content)
 53          for entry in data:
 54              entry_link = self.source_website + "/tree/hierarchy.action"
 55              params = {"root": entry["id"]}
 56              entry_link += "?" + parse.urlencode(params)
 57
 58              number = None
 59              text: str = entry["text"]
 60              if text.startswith("numer"):
 61                  soup = BeautifulSoup(text, "html.parser")
 62                  a_tag = soup.select_one("a")
 63                  if not a_tag:  [63 ↛ 64: condition on line 63 was never true]
 64                      raise ValueError("Cannot parse Collection : couldn't find issue information")
 65                  number = a_tag.text.replace(" ", "_")
 66                  issues.append(self.create_xissue(entry_link, year, volume, number))
 67              elif text.startswith("artykuł"):  [67 ↛ 53: condition on line 67 was always true]
 68                  has_articles = True
 69
 70          if has_articles:
 71              issues.append(self.create_xissue(link, year, volume))
 72
 73          return issues
 74

 75      def parse_issue_content(self, content, xissue):
 76          data = json.loads(content)
 77          for index, entry in enumerate(data):
 78              xarticle = create_articledata()
 79              xarticle.pid = "a" + str(index)
 80              xarticle.url = self.source_website + "/element/" + entry["id"]
 81              xissue.articles.append(xarticle)
 82

 83      # IDEA : manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
 84      def crawl_article(self, xarticle, xissue):
 85          parsed_xarticle = xarticle
 86          if hasattr(xarticle, "url") and xarticle.url:  [86 ↛ 103: condition on line 86 was always true]
 87              response = self.get(xarticle.url)
 88
 89              # Crawl using LOFPL if detected
 90              if response.url.startswith("https://bibliotekanauki.pl"):
 91                  xarticle.url = response.url.replace(
 92                      "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
 93                  )
 94                  targetCrawler = self.subCrawlers[LofplCrawler]
 95                  if targetCrawler is None:  [95 ↛ 96: condition on line 95 was never true]
 96                      raise ValueError("Crawler incorrectly initialized")
 97                  parsed_xarticle = targetCrawler.crawl_article(xarticle, xissue)
 98              elif response.url.startswith("http://pldml.icm.edu.pl"):  [98 ↛ 101: condition on line 98 was always true]
 99                  parsed_xarticle = super().crawl_article(xarticle, xissue)
100              else:
101                  raise NotImplementedError
102
103          if not parsed_xarticle:  [103 ↛ 104: condition on line 103 was never true]
104              raise ValueError("Couldn't crawl article")
105          # The article title may have formulas surrounded with '$'
106          return self.process_resource_metadata(parsed_xarticle)
107

108      def parse_dmlpl_generic_page(self, content: str):
109          soup = BeautifulSoup(content, "html.parser")
110          main = soup.select_one("div.details-content")
111          if not main:  [111 ↛ 112: condition on line 111 was never true]
112              raise ValueError("Cannot parse article : main div not found")
113
114          sections = main.select("div.row")
115          sections_dict: dict[str, Tag] = {}
116          for s in sections:
117              row_label = s.select_one("div.row-label")
118              if not row_label:  [118 ↛ 119: condition on line 118 was never true]
119                  raise ValueError("Cannot parse article : row label not found")
120              tag = s.select_one("div.row-desc")
121              if tag:  [121 ↛ 116: condition on line 121 was always true]
122                  sections_dict[row_label.text] = tag
123
124          return sections_dict
125

126      def parse_article_content(self, content, xissue, xarticle, url):
127          sections_dict = self.parse_dmlpl_generic_page(content)
128
129          xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
130
131          # Author
132          for a_tag in sections_dict["Autorzy"].select("a"):
133              href = a_tag.get("href")
134              if not isinstance(href, str):  [134 ↛ 135: condition on line 134 was never true]
135                  raise ValueError("author href is not a string")
136              author = self.parse_author(self.download_file(self.source_website + "/" + href))
137              author["role"] = "author"
138              xarticle.contributors.append(author)
139
140          # TODO : Contributor ? (Twórcy)
141
142          # PDF
143          if "Treść / Zawartość" in sections_dict:  [143 ↛ 154: condition on line 143 was always true]
144              pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
145              if not pdf_a_tag:  [145 ↛ 146: condition on line 145 was never true]
146                  raise ValueError("Cannot find pdf for article")
147              pdf_url = pdf_a_tag.get("href")
148              if not isinstance(pdf_url, str):  [148 ↛ 149: condition on line 148 was never true]
149                  raise ValueError("Cannot parse pdf url for article")
150              if not pdf_url.startswith("http"):
151                  pdf_url = self.source_website + "/" + pdf_url
152              add_pdf_link_to_xarticle(xarticle, pdf_url)
153          else:
154              print(f"[{xissue.pid}_{xarticle.pid}]PDF not found")
155
156          # Lang
157          xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
158          if len(xarticle.lang) > 3:
159              if xarticle.lang == "pl fr":  [159 ↛ 163: condition on line 159 was always true]
160                  xarticle.lang = "pl"
161                  print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
162              else:
163                  raise ValueError("Cannot parse article lang")
164
165          # Abstract
166          if "Abstrakty" in sections_dict:  [166 ↛ 186: condition on line 166 was always true]
167              abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
168              for div in abstract_divs:  [168 ↛ 169: loop on line 168 never started]
169                  lang = "und"
170                  lang_div = div.select_one("div.articleDetails-langCell")
171                  if lang_div:
172                      lang = cleanup_str(lang_div.text).lower()
173                  text_div = div.select_one("div.articleDetails-abstract")
174                  if not text_div:
175                      raise ValueError(
176                          "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
177                      )
178                  abstract_text = cleanup_str(text_div.text)
179                  if abstract_text != "-":
180                      xabstract = create_abstract(
181                          tag="abstract", value_tex=abstract_text, lang=lang
182                      )
183                      xarticle.abstracts.append(xabstract)
184

185          # Keywords
186          if "Słowa kluczowe" in sections_dict:  [186 ↛ 198: condition on line 186 was always true]
187              keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
188              for keyword_row in keywords_lists:  [188 ↛ 189: loop on line 188 never started]
189                  lang_div = keyword_row.select_one("div.articleDetails-langCell")
190                  lang = cleanup_str(lang_div.text).lower() if lang_div else "und"
191                  keywords_a_tags = keyword_row.select("a")
192                  for a_tag in keywords_a_tags:
193                      subject = create_subj()
194                      subject["value"] = a_tag.text
195                      subject["lang"] = lang
196                      xarticle.kwds.append(subject)
197          # Page
198          if "Strony" in sections_dict:
199              self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))
200
201          return xarticle
202
203      def parse_author(self, content: str):
204          author = create_contributor()
205          sections_dict = self.parse_dmlpl_generic_page(content)
206          author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
207          author["first_name"] = cleanup_str(sections_dict["Imię"].text)
208          if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:  [208 ↛ 209: condition on line 208 was never true]
209              author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
210          return author
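
The subCrawlers dict declared on line 19 exists only so tests can monkeypatch the LOFPL sub-crawler that crawl_article() looks up. A minimal sketch of such a test, assuming pytest's monkeypatch fixture; the stub class and the test body are illustrative, not the project's actual test suite:

    # Illustrative only: swap the LOFPL sub-crawler for a stub before exercising DmlplCrawler.
    from crawler.by_source.dmlpl_crawler import DmlplCrawler
    from crawler.by_source.lofpl_crawler import LofplCrawler


    class StubLofplCrawler:
        """Hypothetical stand-in that avoids hitting bibliotekanauki.pl."""

        def crawl_article(self, xarticle, xissue):
            xarticle.title_tex = "stub title"
            return xarticle


    def test_crawl_article_uses_lofpl_stub(monkeypatch):
        # crawl_article() resolves the sub-crawler via self.subCrawlers[LofplCrawler]
        monkeypatch.setitem(DmlplCrawler.subCrawlers, LofplCrawler, StubLofplCrawler())
        assert DmlplCrawler.subCrawlers[LofplCrawler] is not None
        # ...a real test would now build an xarticle/xissue pair and call crawl_article()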
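The IDEA comment on line 83 is about resolving the bibliotekanauki redirect target without downloading the article body. A minimal sketch of that idea using requests directly (an assumption: the crawler's own self.get() helper may wrap a different HTTP client and caching layer):

    # Sketch of the idea noted on line 83: read the redirect target without the body.
    import requests


    def resolve_redirect_target(url: str, timeout: int = 30) -> str:
        # HEAD with allow_redirects=False returns the Location header but no body.
        # Some servers reject HEAD; a streamed GET that never reads the body is a fallback.
        response = requests.head(url, allow_redirects=False, timeout=timeout)
        if response.status_code in (301, 302, 303, 307, 308):
            return response.headers.get("Location", url)
        return url

crawl_article() could then branch on the resolved URL (bibliotekanauki.pl vs pldml.icm.edu.pl) before deciding whether to download the page at all.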