Coverage for src/crawler/by_source/dmlcz_crawler.py: 80%

133 statements  

coverage.py v7.12.0, created at 2025-12-03 10:24 +0000

import re

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.cmds.mixed_citation import ExtLinkXml, GenericRefElement, MixedCitation


class DmlczCrawler(BaseCollectionCrawler):
    source_name = "Czech Digital Mathematics Library"
    source_domain = "DMLCZ"
    source_website = "https://dml.cz"
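
    # DML-CZ serves DSpace-style handle URLs: collection, issue and article
    # pages all live under /handle/<prefix>.dmlcz/<id> paths
    # (e.g. /handle/10338.dmlcz/149887).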

    issue_href = r"/handle/\d+\.dmlcz/\d+"

    def parse_collection_content(self, content):
        """
        Parse the HTML page of a DML-CZ collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issue_nodes = soup.find_all("td", {"class": "volume"})

        reg_year = re.compile(r"\d{4}")
        reg_volume = re.compile(r"Volume \d+")

        for issue_node in issue_nodes:
            issue_text = issue_node.get_text()
            if re.search(r"\d+", issue_text):
                elem = issue_node.find("a")
                dates = reg_year.search(issue_text)
                volume = reg_volume.search(elem.get_text())
                issues = issue_node.find_next("td").find_all("a")
                if volume:
                    volume = volume[0].replace("Volume ", "")
                if dates:
                    dates = dates[0]
                for issue in issues:
                    link = issue.get("href")
                    number = issue.get_text()
                    xissue = self.create_dmlcz_xissue(link, volume, number, dates)
                    if xissue:
                        xissues.append(xissue)

        return xissues
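
    # A DML-CZ year label can cover two calendar years (a "YYYY/YYYY" string):
    # get_year keeps only the first year, while create_dmlcz_xissue stores the
    # label hyphenated ("YYYY-YYYY").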

    def get_year(self, year):
        if "/" in year:
            year = year.split("/")[0]

        return year
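
    # For instance (hypothetical values): a link "/handle/10338.dmlcz/100000"
    # with volume_str "42", number "3,4" and dates "1992/1993" yields an xissue
    # for https://dml.cz/handle/10338.dmlcz/100000 with volume "42",
    # number "3-4" and year "1992-1993".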

    def create_dmlcz_xissue(self, url, volume_str: str, number, dates):
        year = dates.replace("/", "-")
        number = number.replace(",", "-")

        if not volume_str.isnumeric():
            volume = None
            self.logger.debug("Couldn't parse volume string", extra={"url": url})
        else:
            volume = str(int(volume_str))

        xissue = super().create_xissue(self.source_website + url, year, volume, number)

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("td", {"class": "article"})

        # DML-CZ may list the same article multiple times
        # (e.g. https://dml.cz/handle/10338.dmlcz/149887), so articles whose
        # url has already been seen are skipped.
        article_urls = []

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                if url not in article_urls:
                    article_urls.append(url)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(index_article)
                    xarticle.url = self.source_website + url

                    xissue.articles.append(xarticle)
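
    # get_metadata_using_citation_meta (from the base crawler) reads the
    # citation meta tags; the DOM lookups below (title, abstract, MSC subjects,
    # pages, references, DOI) cover what those tags do not provide.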

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )

        bloc_ref_ids = soup.find("div", {"class": "item-refids"})

        # TITLE
        title_node = soup.find("span", {"class": "item-title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # ABSTRACT
        abstract_section_node = soup.find("dim:field")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())

            xarticle.abstracts.append(
                create_abstract(
                    value_tex=abstract,
                    lang=xarticle.lang,
                )
            )

        # PDF
        # link_nodes = soup.find_all("a")
        # for link_node in link_nodes:
        #     pdf_url = link_node.get("href")
        #     if pdf_url.startswith("/bitstream/"):
        #         add_pdf_link_to_xarticle(xarticle, pdf_url)

        # MSC
        reg_msc = re.compile("/browse-subject")
        subjs_nodes = [
            a.get_text() for a in soup.find_all("a", href=True) if reg_msc.search(a["href"])
        ]
        for subj in subjs_nodes:
            subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
            xarticle.kwds.append(subject)

        # PAGES
        pages = soup.find("span", {"class": "item-pp"})
        if pages:
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page

                xarticle.fpage = first_page

        # Biblio
        bibitems_tags = soup.select("div.references-inside div.reference")
        bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
        xarticle.bibitems = bibitems

        # DOI
        reg_doi = re.compile(r"dx\.doi\.org")

        if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):
            doi_node = [
                a for a in bloc_ref_ids.find_all("a", href=True) if reg_doi.search(a["href"])
            ]
            if len(doi_node) > 0:
                doi = doi_node[0].get_text()
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                    xarticle.doi = doi

                    # fix wrong doi attribution for article a14 of volume 62 number 1
                    # (10.1007/s10587-012-0005-x)
                    if xarticle.pid in ["CMJ_2012_62_1_a14", "ZCSUT_2012_22_3_a3"]:
                        xarticle.doi = None
                    else:
                        xarticle.pid = (
                            doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_")
                        )

        # Hack to handle articles with no titles
        if not xarticle.title_tex:
            xarticle.title_tex = " "

        return xarticle
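
    # A DML-CZ reference entry separates its parts with "|" characters, wraps
    # the title in <b> and links in <a>; parse_bibitem_tag maps these parts
    # onto a JATS mixed citation.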

    def parse_bibitem_tag(self, tag: Tag):
        citation_builder = MixedCitation()
        for child in tag.children:
            if isinstance(child, str):
                if child.strip() == "|":
                    continue
                citation_builder.elements.append(child)
                continue
            if isinstance(child, Tag):
                if child.name == "b":
                    el = GenericRefElement()
                    el.name = "article-title"
                    el.elements.append(child.text)
                    citation_builder.elements.append(el)
                    continue
                if child.name == "a":
                    href = child.get("href")
                    if not isinstance(href, str):
                        continue
                    el = ExtLinkXml(escape(href), escape(child.text))
                    citation_builder.elements.append(el)
                    continue

        return citation_builder.get_jats_ref()
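

# A self-contained demonstration of the markup shape parse_collection_content
# expects. The HTML snippet is fabricated for illustration (handles and values
# are hypothetical, not real DML-CZ data): each <td class="volume"> cell holds
# a "Volume N" link plus the year label, and the next <td> lists the issues.
if __name__ == "__main__":
    sample = """
    <table><tr>
      <td class="volume"><a href="/handle/10338.dmlcz/100000">Volume 42</a> 1992/1993</td>
      <td><a href="/handle/10338.dmlcz/100001">1</a> <a href="/handle/10338.dmlcz/100002">2</a></td>
    </tr></table>
    """
    soup = BeautifulSoup(sample, "html.parser")
    cell = soup.find("td", {"class": "volume"})
    # The same lookups parse_collection_content performs on a real page:
    print(cell.get_text())  # -> Volume 42 1992/1993
    print([a.get_text() for a in cell.find_next("td").find_all("a")])  # -> ['1', '2']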