Coverage for src/crawler/by_source/dmlcz_crawler.py: 74%

111 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler


class DmlczCrawler(BaseCollectionCrawler):
    source_name = "Czech Digital Mathematics Library"
    source_domain = "DMLCZ"
    source_website = "https://dml.cz"

    issue_href = r"/handle/\d+.dmlcz/\d+"
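    # e.g. "/handle/10338.dmlcz/149887" (issue links on dml.cz are handle paths of this form)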

    def parse_collection_content(self, content):
        """
        Parse the collection's HTML page and return a list of xissues.
        Each xissue carries its pid/volume/number/year metadata and its url.

        self.periode is set at the end, based on the xissue years found in the page.
        """
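        # Expected markup (a sketch inferred from the parsing below): each <td class="volume">
        # cell contains a link whose text carries a "Volume <n>" label and a 4-digit year,
        # and the next <td> holds the links to the individual issues.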

        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issue_nodes = soup.find_all("td", {"class": "volume"})

        for issue_node in issue_nodes:
            reg_year = re.compile(r"\d{4}")
            reg_volume = re.compile(r"Volume \d+")
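            # reg_year extracts the 4-digit year from the cell text; reg_volume extracts the
            # "Volume <n>" label from the link text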

            issue_text = issue_node.get_text()
            if re.compile(r"\d+").search(issue_text):
                elem = issue_node.find("a")
                dates = reg_year.search(issue_text)
                volume = reg_volume.search(elem.get_text())
                issues = issue_node.find_next("td")
                issues = issues.find_all("a")
                if volume:
                    volume = volume[0].replace("Volume ", "")
                if dates:
                    search = reg_year.search(issue_text)
                    if search is not None:
                        dates = search[0]
                for issue in issues:
                    link = issue.get("href")
                    number = issue.get_text()
                    xissue = self.create_dmlcz_xissue(link, volume, number, dates)
                    if xissue:
                        xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)
        self.periode = self.get_or_create_periode()

        return xissues

    def get_year(self, year):
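        # e.g. a split year such as "2012/2013" collapses to "2012"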

        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_dmlcz_xissue(self, url, volume, number, dates):
        year = dates.replace("/", "-")
        number = number.replace(",", "-")

        # volume might not be an integer. eLibM puts special issue titles as volume number.

        try:
            volume_for_pid = int(volume)
        except ValueError:
            print("error parsing volume")
            # non-numeric volume (e.g. a special issue title): fall back to the raw string
            # so the pid below can still be built
            volume_for_pid = volume

        xissue = super().create_xissue(self.source_website + url, year, volume, number)
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
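        # issue pid pattern: <collection>_<year>__<volume>_<number>; article pids append an
        # "_a<index>" suffix (e.g. CMJ_2012__62_1_a14 further below)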

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("td", {"class": "article"})

        # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887),
        # so we skip any article URL already seen in this issue
        article_urls = []

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                if url not in article_urls:
                    article_urls.append(url)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(index_article)
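                    # positional article pid within the issue (a0, a1, ...); presumably combined
                    # with the issue pid by the base crawler (cf. CMJ_2012__62_1_a14)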

                    xarticle.url = self.source_website + url

                    xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )
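        # the base-class helper above presumably fills the listed fields from the page's
        # citation_* <meta> tags; anything it misses is scraped from the HTML below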

        bloc_ref_ids = soup.find("div", {"class": "item-refids"})

        # TITLE
        title_node = soup.find("span", {"class": "item-title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # ABSTRACT
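        # <dim:field> is DSpace DIM metadata; only some pages seem to expose the abstract this way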

        abstract_section_node = soup.find("dim:field")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract,
                lang=xarticle.lang,
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        # link_nodes = soup.find_all("a")
        # for link_node in link_nodes:
        #     pdf_url = link_node.get("href")
        #     if pdf_url.startswith("/bitstream/"):
        #         add_pdf_link_to_xarticle(xarticle, pdf_url)
        reg_msc = re.compile("/browse-subject")
        subjs_nodes = [
            a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href", ""))
        ]
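        # anchors pointing at /browse-subject pages carry the subject classification;
        # their text becomes the msc keywords below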

        # MSC
        for subj in subjs_nodes:
            subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
            xarticle.kwds.append(subject)

        # PAGES
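        # the "item-pp" span holds a page range such as "<first>-<last>"; the regex below also
        # tolerates parenthesised page numbers, which are stripped before assignment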

        pages = soup.find("span", {"class": "item-pp"})
        if pages:
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page

                xarticle.fpage = first_page

        # Biblio
        # bibitems_tags = soup.select("div.references-inside div.reference")
        # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
        # if len(bibitems) > 0:
        #     xarticle.abstracts.append(self.create_bibliography(bibitems))

        # DOI
        reg_doi = re.compile("dx.doi.org")
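        # reference links of the form http(s)://dx.doi.org/10.xxxx/... yield the DOI by keeping
        # everything from the first "10." onwards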

        if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):
            doi_node = [
                a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href", ""))
            ]
            if len(doi_node) > 0:
                doi = doi_node[0].get_text()
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                    xarticle.doi = doi

                    # fix wrong doi attribution for article a14 of volume 62 number 1
                    # 10.1007/s10587-012-0005-x:
                    if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]:
                        xarticle.doi = None
                    else:
                        xarticle.pid = (
                            doi.replace("/", "_")
                            .replace(".", "_")
                            .replace("-", "_")
                            .replace(":", "_")
                        )

        # Hack to handle articles with no titles
        if not xarticle.title_tex:
            xarticle.title_tex = " "

        return xarticle