Coverage for src/crawler/by_source/dmlcz_crawler.py: 80%

133 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

1import re 

2 

3from bs4 import BeautifulSoup 

4from bs4 import Tag 

5from crawler.base_crawler import BaseCollectionCrawler 

6from crawler.base_crawler import add_pdf_link_to_xarticle 

7from crawler.crawler_types import CitationLiteral 

8 

9from ptf.model_data import create_articledata 

10from ptf.model_data import create_issuedata 

11from ptf.model_data import create_subj 

12 

13 

14class DmlczCrawler(BaseCollectionCrawler): 

15 source_name = "Czech Digital Mathematics Library" 

16 source_domain = "DMLCZ" 

17 source_website = "https://dml.cz" 

18 

19 def __init__(self, *args, **kwargs): 

20 super().__init__(*args, **kwargs) 

21 

22 # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.

23 

24 self.source = self.get_or_create_source() 

25 

26 self.issue_href = r"/handle/\d+.dmlcz/\d+" 
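# Example of a matching relative issue link: /handle/10338.dmlcz/149887 (cf. the issue URL cited in parse_issue_content)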

27 

28 def parse_collection_content(self, content): 

29 """ 

30 Parse the HTML page of the DML-CZ collection and return a list of xissue.

31 Each xissue has its pid/volume/number/year metadata + its url 

32 

33 self.periode is set at the end based on the xissue years of the HTML page 

34 """ 

35 soup = BeautifulSoup(content, "html.parser") 

36 xissues = [] 

37 

38 issue_nodes = soup.find_all("td", {"class": "volume"})

39 

40 for issue_node in issue_nodes: 

41 reg_year = re.compile(r"\d{4}") 

42 reg_volume = re.compile(r"Volume \d+") 

43 issue_text = issue_node.get_text() 

44 if re.compile(r"\d+").search(issue_text): 44 ↛ 40 (line 44 didn't jump to line 40 because the condition on line 44 was always true)

45 elem = issue_node.find("a") 

46 dates = reg_year.search(issue_text) 

47 volume = reg_volume.search(elem.get_text()) 

48 issues = issue_node.find_next("td")

49 issues = issues.find_all("a")

50 if volume: 50 ↛ 52 (line 50 didn't jump to line 52 because the condition on line 50 was always true)

51 volume = volume[0].replace("Volume ", "") 

52 if dates: 52 ↛ 56 (line 52 didn't jump to line 56 because the condition on line 52 was always true)

53 search = reg_year.search(issue_text) 

54 if search is not None: 54 ↛ 56 (line 54 didn't jump to line 56 because the condition on line 54 was always true)

55 dates = search[0] 

56 for issue in issues: 

57 link = issue.get("href") 

58 number = issue.get_text() 

59 xissue = self.create_xissue(link, volume, number, dates) 

60 if xissue: 60 ↛ 56 (line 60 didn't jump to line 56 because the condition on line 60 was always true)

61 xissues.append(xissue) 

62 

63 self.periode_begin = self.get_year(xissues[0].year) 

64 self.periode_end = self.get_year(xissues[-1].year) 

65 self.periode = self.get_or_create_periode() 

66 

67 return xissues 

68 

69 def get_year(self, year): 
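# Keep only the first year when the value is a range such as "2012/2013".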

70 if "/" in year: 70 ↛ 71line 70 didn't jump to line 71 because the condition on line 70 was never true

71 year = year.split("/")[0] 

72 

73 return year 

74 

75 def create_xissue(self, url, volume, number, dates): 

76 year = dates.replace("/", "-") 

77 

78 # volume might not be an integer. eLibM puts special issue titles as volume number. 

79 

80 try: 

81 volume_for_pid = int(volume) 

82 except ValueError: 

83 print("error parsing volume")
# volume is not an integer: fall back to the raw string so the pid below can still be built
volume_for_pid = volume

84 

85 xissue = create_issuedata() 

86 number = number.replace(",", "-") 

87 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}" 
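# e.g. this yields a pid like "CMJ_2012__62_1" (compare the hard-coded article pid "CMJ_2012__62_1_a14" below)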

88 xissue.year = year 

89 xissue.volume = volume 

90 xissue.number = number 

91 xissue.url = self.source_website + url 

92 

93 return xissue 

94 

95 def parse_issue_content(self, content, xissue): 
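# Collect the article links from an issue page, skipping duplicates, and append one xarticle per unique URL.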

96 soup = BeautifulSoup(content, "html.parser") 

97 article_nodes = soup.find_all("td", {"class": "article"}) 

98 

99 # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887) 

100 # We need to ignore the articles already crawled 

101 article_urls = [] 

102 

103 for index_article, article_node in enumerate(article_nodes): 

104 article_link_node = article_node.find("a") 

105 if article_link_node: 105 ↛ 103 (line 105 didn't jump to line 103 because the condition on line 105 was always true)

106 url = article_link_node.get("href") 

107 if url not in article_urls: 107 ↛ 103 (line 107 didn't jump to line 103 because the condition on line 107 was always true)

108 article_urls.append(url) 

109 

110 xarticle = create_articledata() 

111 xarticle.pid = "a" + str(index_article) 

112 xarticle.url = self.source_website + url 

113 

114 xissue.articles.append(xarticle) 

115 

116 def parse_article_content(self, content, xissue, xarticle, url, pid): 

117 """ 

118 Parse the content with BeautifulSoup and return an ArticleData

119 """ 

120 xarticle = create_articledata() 

121 xarticle.pid = pid 

122 xarticle.lang = "en" 

123 

124 soup = BeautifulSoup(content, "html.parser") 

125 bloc_ref_ids = soup.find("div", {"class": "item-refids"}) 

126 # TITLE 

127 title_node = soup.find("span", {"class": "item-title"}) 

128 if title_node: 128 ↛ 132 (line 128 didn't jump to line 132 because the condition on line 128 was always true)

129 xarticle.title_tex = title_node.get_text() 

130 

131 # ABSTRACT 

132 abstract_section_node = soup.find("dim:field") 

133 if abstract_section_node: 133 ↛ 134 (line 133 didn't jump to line 134 because the condition on line 133 was never true)

134 abstract = str(abstract_section_node.get_text()) 

135 xabstract = { 

136 "tag": "abstract", 

137 "value_html": "", 

138 "value_tex": abstract, 

139 "value_xml": "", 

140 "lang": "en", 

141 } 

142 xarticle.abstracts.append(xabstract) 

143 

144 # PDF 

145 link_nodes = soup.find_all("a") 

146 for link_node in link_nodes: 

147 pdf_url = link_node.get("href") 

148 if pdf_url.startswith("/bitstream/"): 

149 add_pdf_link_to_xarticle(xarticle, pdf_url) 

150 reg_msc = re.compile("/browse-subject") 

151 subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))] 

152 

153 # MSC 

154 for subj in subjs_nodes: 154 ↛ 155 (line 154 didn't jump to line 155 because the loop on line 154 never started)

155 subject = create_subj() 

156 subject["value"] = subj 

157 subject["type"] = "msc" 

158 subject["lang"] = "en" 

159 xarticle.kwds.append(subject) 

160 

161 # PAGES 

162 pages = soup.find("span", {"class": "item-pp"}) 

163 if pages: 163 ↛ 181 (line 163 didn't jump to line 181 because the condition on line 163 was always true)

164 pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text()) 
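# Matches page ranges such as "123-145" or "(1)-(10)"; surrounding parentheses are stripped below.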

165 if pages_to: 165 ↛ 181 (line 165 didn't jump to line 181 because the condition on line 165 was always true)

166 parts = pages_to[0].split("-") 

167 first_page = parts[0].replace("(", "").replace(")", "") 

168 if len(parts) > 1: 168 ↛ 172 (line 168 didn't jump to line 172 because the condition on line 168 was always true)

169 last_page = parts[1].replace("(", "").replace(")", "") 

170 xarticle.lpage = last_page 

171 

172 xarticle.fpage = first_page 

173 

174 # Biblio 

175 # bibitems_tags = soup.select("div.references-inside div.reference") 

176 # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags] 

177 # if len(bibitems) > 0: 

178 # xarticle.abstracts.append(self.create_bibliography(bibitems)) 

179 

180 # DOI 

181 reg_doi = re.compile("dx.doi.org") 

182 

183 what: list[CitationLiteral] = [ 

184 "lang", 

185 "title", 

186 "author", 

187 "pdf", 

188 "abstract", 

189 "page", 

190 "mr", 

191 "zbl", 

192 "publisher", 

193 "keywords", 

194 ] 
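# Fields requested from get_metadata_using_citation_meta below, which reads the page's citation_* <meta> tags.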

195 self.get_metadata_using_citation_meta(xarticle, xissue, soup, what) 

196 

197 if bloc_ref_ids and isinstance(bloc_ref_ids, Tag): 197 ↛ 216 (line 197 didn't jump to line 216 because the condition on line 197 was always true)

198 doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))] 

199 if len(doi_node) > 0: 199 ↛ 216 (line 199 didn't jump to line 216 because the condition on line 199 was always true)

200 doi = doi_node[0].get_text() 

201 pos = doi.find("10.") 

202 if pos > 0: 202 ↛ 203 (line 202 didn't jump to line 203 because the condition on line 202 was never true)

203 doi = doi[pos:] 

204 xarticle.doi = doi 

205 

206 # fix wrong doi attribution for article a14 of volume 62 number 1 

207 # 10.1007/s10587-012-0005-x: 

208 if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]: 208 ↛ 209 (line 208 didn't jump to line 209 because the condition on line 208 was never true)

209 xarticle.doi = None 

210 else: 

211 xarticle.pid = ( 

212 doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_") 

213 ) 
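# e.g. "10.1007/s10587-012-0005-x" becomes the pid "10_1007_s10587_012_0005_x"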

214 

215 # Hack to handle articles with no titles 

216 if not xarticle.title_tex: 216 ↛ 217 (line 216 didn't jump to line 217 because the condition on line 216 was never true)

217 xarticle.title_tex = " " 

218 

219 return xarticle