Coverage for src/crawler/by_source/dmlcz_crawler.py: 77%

104 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1import re 

2 

3from bs4 import BeautifulSoup, Tag 

4from ptf.model_data import create_abstract, create_articledata, create_subj 

5 

6from crawler.base_crawler import BaseCollectionCrawler 

7 

8 

class DmlczCrawler(BaseCollectionCrawler):
    """Crawler for the Czech Digital Mathematics Library (https://dml.cz)."""

    source_name = "Czech Digital Mathematics Library"
    source_domain = "DMLCZ"
    source_website = "https://dml.cz"

    issue_href = r"/handle/\d+.dmlcz/\d+"

    # Loop-invariant patterns: compiled once at class level instead of on
    # every iteration of the collection-parsing loop.
    _reg_year = re.compile(r"\d{4}")
    _reg_volume = re.compile(r"Volume \d+")
    _reg_digits = re.compile(r"\d+")

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        for issue_node in soup.find_all("td", {"class": "volume"}):
            issue_text = issue_node.get_text()
            # Skip cells that carry no digits at all (no volume/year info).
            if not self._reg_digits.search(issue_text):
                continue

            link_node = issue_node.find("a")
            if link_node is None:
                # Defensive: a volume cell without a link cannot be parsed
                # (the original code would have crashed on .get_text()).
                continue

            dates_match = self._reg_year.search(issue_text)
            dates = dates_match[0] if dates_match else None

            volume_match = self._reg_volume.search(link_node.get_text())
            volume = (
                volume_match[0].replace("Volume ", "") if volume_match else None
            )

            # The <td> following the volume cell holds one link per issue.
            issue_links = issue_node.find_next("td").find_all("a")
            for issue in issue_links:
                xissue = self.create_dmlcz_xissue(
                    issue.get("href"), volume, issue.get_text(), dates
                )
                if xissue:
                    xissues.append(xissue)

        return xissues

    def get_year(self, year):
        """Normalize a year string: for ranges like "1990/1991" keep the first year."""
        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_dmlcz_xissue(self, url, volume_str: str, number, dates):
        """Build an xissue from the parsed link/volume/number/dates strings.

        A dates range such as "1990/1991" becomes "1990-1991" and an issue
        number such as "1,2" becomes "1-2". A non-numeric volume string is
        replaced by None (and logged) before delegating to create_xissue.
        """
        year = dates.replace("/", "-")
        number = number.replace(",", "-")

        volume = volume_str
        if not volume.isnumeric():
            volume = None
            self.logger.debug("Couldn't parse volume string", extra={"url": url})

        return super().create_xissue(self.source_website + url, year, volume, number)

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one xarticle per unique article link."""
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("td", {"class": "article"})

        # DML-CZ may list the same article multiple times
        # (ex: https://dml.cz/handle/10338.dmlcz/149887).
        # Track seen URLs in a set (O(1) membership) and skip duplicates.
        seen_urls = set()

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if not article_link_node:
                continue
            url = article_link_node.get("href")
            if url in seen_urls:
                continue
            seen_urls.add(url)

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = self.source_website + url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and return the enriched
        ArticleData (title, abstract, MSC subjects, pages, DOI).
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )

        bloc_ref_ids = soup.find("div", {"class": "item-refids"})

        # TITLE
        title_node = soup.find("span", {"class": "item-title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # ABSTRACT
        abstract_section_node = soup.find("dim:field")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xarticle.abstracts.append(
                create_abstract(
                    value_tex=abstract,
                    lang=xarticle.lang,
                )
            )

        # MSC subjects: links pointing at /browse-subject pages.
        reg_msc = re.compile("/browse-subject")
        for a in soup.find_all("a"):
            href = a.get("href")
            # Some <a> tags have no href; skip them instead of crashing
            # (regex.search(None) raises TypeError).
            if href and reg_msc.search(href):
                subject = create_subj(
                    value=a.get_text(), type="msc", lang=xarticle.lang
                )
                xarticle.kwds.append(subject)

        # PAGES: "pp. 12-34" style spans, with optional parentheses.
        pages = soup.find("span", {"class": "item-pp"})
        if pages:
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(
                pages.get_text()
            )
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    xarticle.lpage = parts[1].replace("(", "").replace(")", "")
                xarticle.fpage = first_page

        # DOI: look for a dx.doi.org link inside the reference-ids block.
        reg_doi = re.compile("dx.doi.org")
        if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):
            doi_nodes = [
                a
                for a in bloc_ref_ids.find_all("a")
                # Guard against href-less anchors (regex.search(None) raises).
                if a.get("href") and reg_doi.search(a.get("href"))
            ]
            if len(doi_nodes) > 0:
                doi = doi_nodes[0].get_text()
                # Strip any prefix before the "10." DOI root.
                # NOTE(review): pos > 0 leaves a DOI that already starts with
                # "10." untouched, which is the desired no-op — but it also
                # means pos == 0 skips the slice; kept as in the original.
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                xarticle.doi = doi

                # Fix wrong doi attribution for article a14 of volume 62
                # number 1 (10.1007/s10587-012-0005-x).
                if xarticle.pid in ["CMJ_2012_62_1_a14", "ZCSUT_2012_22_3_a3"]:
                    xarticle.doi = None
                else:
                    xarticle.pid = (
                        doi.replace("/", "_")
                        .replace(".", "_")
                        .replace("-", "_")
                        .replace(":", "_")
                    )

        # Hack to handle articles with no titles
        if not xarticle.title_tex:
            xarticle.title_tex = " "

        return xarticle