Coverage for src/crawler/by_source/dmlpl_crawler.py: 9%

154 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    # HACK: workaround for tests (monkeypatching).
    # We store the class here so we can monkeypatch it when running tests.
    subCrawlers = {LofplCrawler: None}

    def parse_collection_content(self, content):
        """
        Parse the DMLPL collection tree (JSON) and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues
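
    # Illustrative shape of one entry in the tree JSON consumed above (field values
    # are invented; only the "id"/"text" keys and the "tom/rocznik <a>volume</a>
    # <a>year</a>" pattern are implied by the parsing):
    #
    #     {"id": "some-node-id", "text": 'tom/rocznik <a href="...">12</a> <a href="...">1984</a>'}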

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):
                has_articles = True

        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues

    def parse_issue_content(self, content, xissue):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA: manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
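    # A minimal sketch of that idea (an assumption, not the crawler's current behaviour):
    # with plain `requests`, disabling redirect following returns only the 3xx response,
    # whose Location header carries the bibliotekanauki target URL:
    #
    #     import requests
    #     resp = requests.get(article_url, allow_redirects=False, timeout=30)
    #     if resp.is_redirect:
    #         redirect_url = resp.headers.get("Location")
    #
    # `article_url` is a placeholder; whether self._get can skip redirects is not known here.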

    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            response = self._get(xarticle.url)

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:
                    raise ValueError("Crawler incorrectly initialized")
                parsed_xarticle = targetCrawler.crawl_article(xarticle, xissue)
            elif response.url.startswith("http://pldml.icm.edu.pl"):
                parsed_xarticle = super().crawl_article(xarticle, xissue)
            else:
                raise NotImplementedError

        if not parsed_xarticle:
            raise ValueError("Couldn't crawl article")
        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def parse_dmlpl_generic_page(self, content: str):
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:
                sections_dict[row_label.text] = tag

        return sections_dict
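
    # Illustrative shape of the mapping returned above (inferred from the selectors;
    # the value below is an example, not captured output):
    #
    #     sections_dict["Tytuł artykułu"]  ->  <div class="row-desc">On some theorem ...</div>
    #
    # Keys are the Polish row labels used on pldml.icm.edu.pl ("Autorzy", "Strony", ...).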

    def parse_article_content(self, content, xissue, xarticle, url):
        sections_dict = self.parse_dmlpl_generic_page(content)

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            self.logger.info("PDF not found", extra={"pid": xarticle.pid})

        # Lang
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":
                xarticle.lang = "pl"
                self.logger.info(
                    f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)",
                    extra={"pid": xarticle.pid},
                )
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                abstract_text = cleanup_str(text_div.text)
                if abstract_text != "-":
                    xarticle.abstracts.append(create_abstract(value_tex=abstract_text, lang=lang))

        # Keywords
        if "Słowa kluczowe" in sections_dict:
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keyword_row in keywords_lists:
                lang = "und"
                # Use the per-row language cell when present, as for the abstracts above
                lang_div = keyword_row.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keyword_row.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author
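
The subCrawlers hook near the top of the class exists so tests can swap LofplCrawler for a stub. A minimal sketch of such a monkeypatch with pytest (the stub and test name are hypothetical; only the subCrawlers dict and the crawl_article signature come from the code above):

from crawler.by_source.dmlpl_crawler import DmlplCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler


class StubLofplCrawler:
    # Hypothetical stand-in: returns the article untouched instead of querying
    # bibliotekanauki.pl, so crawl_article can be exercised offline.
    def crawl_article(self, xarticle, xissue):
        return xarticle


def test_lofpl_slot_can_be_stubbed(monkeypatch):
    # monkeypatch.setitem restores the original entry after the test.
    monkeypatch.setitem(DmlplCrawler.subCrawlers, LofplCrawler, StubLofplCrawler())
    assert DmlplCrawler.subCrawlers[LofplCrawler] is not None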