Coverage for src/crawler/by_source/dmlcz_crawler.py: 80%

133 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

1import re 

2 

3from bs4 import BeautifulSoup 

4from bs4 import Tag 

5from crawler.base_crawler import BaseCollectionCrawler 

6from crawler.base_crawler import add_pdf_link_to_xarticle 

7from crawler.crawler_types import CitationLiteral 

8 

9from ptf.model_data import create_articledata 

10from ptf.model_data import create_issuedata 

11from ptf.model_data import create_subj 

12 

13 

14class DmlczCrawler(BaseCollectionCrawler): 

15 source_name = "Czech Digital Mathematics Library" 

16 source_domain = "DMLCZ" 

17 source_website = "https://dml.cz" 

18 

19 def __init__(self, *args, **kwargs): 

20 super().__init__(*args, **kwargs) 

21 

22 # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.

23 

24 self.source = self.get_or_create_source() 

25 

26 self.issue_href = r"/handle/\d+.dmlcz/\d+" 
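# Example of a matching relative issue link: /handle/10338.dmlcz/149887 (cf. the issue URL cited in parse_issue_content)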

27 

28 def parse_collection_content(self, content): 

29 """ 

30 Parse the HTML page of the DML-CZ collection and return a list of xissue.

31 Each xissue has its pid/volume/number/year metadata + its url 

32 

33 self.periode is set at the end based on the xissue years of the HTML page 

34 """ 

35 soup = BeautifulSoup(content, "html.parser") 

36 xissues = [] 

37 

38 issue_nodes = soup.find_all("td", {"class": "volume"})

39 

40 for issue_node in issue_nodes: 

41 reg_year = re.compile(r"\d{4}") 

42 reg_volume = re.compile(r"Volume \d+") 

43 issue_text = issue_node.get_text() 

44 if re.compile(r"\d+").search(issue_text): 44 ↛ 40 (line 44 didn't jump to line 40 because the condition on line 44 was always true)

45 elem = issue_node.find("a") 

46 dates = reg_year.search(issue_text) 

47 volume = reg_volume.search(elem.get_text()) 

48 issues = issue_node.find_next("td")

49 issues = issues.find_all("a")

50 if volume: 50 ↛ 52 (line 50 didn't jump to line 52 because the condition on line 50 was always true)

51 volume = volume[0].replace("Volume ", "") 

52 if dates: 52 ↛ 56 (line 52 didn't jump to line 56 because the condition on line 52 was always true)

53 search = reg_year.search(issue_text) 

54 if search is not None: 54 ↛ 56 (line 54 didn't jump to line 56 because the condition on line 54 was always true)

55 dates = search[0] 

56 for issue in issues: 

57 link = issue.get("href") 

58 number = issue.get_text() 

59 xissue = self.create_xissue(link, volume, number, dates) 

60 if xissue: 60 ↛ 56 (line 60 didn't jump to line 56 because the condition on line 60 was always true)

61 xissues.append(xissue) 

62 

63 self.periode_begin = self.get_year(xissues[0].year) 

64 self.periode_end = self.get_year(xissues[-1].year) 

65 self.periode = self.get_or_create_periode() 

66 

67 return xissues 

68 

69 def get_year(self, year): 
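# Keep only the first year when the value is a range such as "2012/2013".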

70 if "/" in year: 70 ↛ 71line 70 didn't jump to line 71 because the condition on line 70 was never true

71 year = year.split("/")[0] 

72 

73 return year 

74 

75 def create_xissue(self, url, volume, number, dates): 

76 year = dates.replace("/", "-") 

77 

78 # volume might not be an integer. eLibM puts special issue titles as volume number. 

79 

80 try: 

81 volume_for_pid = int(volume) 

82 except ValueError: 

83 print("error parsing volume")
# volume is not an integer: fall back to the raw string so the pid below can still be built
volume_for_pid = volume

84 

85 xissue = create_issuedata() 

86 number = number.replace(",", "-") 

87 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}" 
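# e.g. this yields a pid like "CMJ_2012__62_1" (compare the hard-coded article pid "CMJ_2012__62_1_a14" below)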

88 xissue.year = year 

89 xissue.volume = volume 

90 xissue.number = number 

91 xissue.url = self.source_website + url 

92 

93 return xissue 

94 

95 def parse_issue_content(self, content, xissue): 
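# Collect the article links from an issue page, skipping duplicates, and append one xarticle per unique URL.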

96 soup = BeautifulSoup(content, "html.parser") 

97 article_nodes = soup.find_all("td", {"class": "article"}) 

98 

99 # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887) 

100 # We need to ignore the articles already crawled 

101 article_urls = [] 

102 

103 for index_article, article_node in enumerate(article_nodes): 

104 article_link_node = article_node.find("a") 

105 if article_link_node: 105 ↛ 103 (line 105 didn't jump to line 103 because the condition on line 105 was always true)

106 url = article_link_node.get("href") 

107 if url not in article_urls: 107 ↛ 103 (line 107 didn't jump to line 103 because the condition on line 107 was always true)

108 article_urls.append(url) 

109 

110 xarticle = create_articledata() 

111 xarticle.pid = "a" + str(index_article) 

112 xarticle.url = self.source_website + url 

113 

114 xissue.articles.append(xarticle) 

115 

116 def parse_article_content(self, content, xissue, xarticle, url, pid): 

117 """ 

118 Parse the content with BeautifulSoup and return an ArticleData

119 """ 

120 xarticle = create_articledata() 

121 xarticle.pid = pid 

122 xarticle.lang = "en" 

123 

124 soup = BeautifulSoup(content, "html.parser") 

125 bloc_ref_ids = soup.find("div", {"class": "item-refids"}) 

126 # TITLE 

127 title_node = soup.find("span", {"class": "item-title"}) 

128 if title_node: 128 ↛ 132 (line 128 didn't jump to line 132 because the condition on line 128 was always true)

129 xarticle.title_tex = title_node.get_text() 

130 

131 # ABSTRACT 

132 abstract_section_node = soup.find("dim:field") 

133 if abstract_section_node: 133 ↛ 134 (line 133 didn't jump to line 134 because the condition on line 133 was never true)

134 abstract = str(abstract_section_node.get_text()) 

135 xabstract = { 

136 "tag": "abstract", 

137 "value_html": "", 

138 "value_tex": abstract, 

139 "value_xml": "", 

140 "lang": "en", 

141 } 

142 xarticle.abstracts.append(xabstract) 

143 

144 # PDF 

145 link_nodes = soup.find_all("a") 

146 for link_node in link_nodes: 

147 pdf_url = link_node.get("href") 

148 if pdf_url.startswith("/bitstream/"): 

149 add_pdf_link_to_xarticle(xarticle, pdf_url) 

150 reg_msc = re.compile("/browse-subject") 

151 subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))] 

152 

153 # MSC 

154 for subj in subjs_nodes: 154 ↛ 155 (line 154 didn't jump to line 155 because the loop on line 154 never started)

155 subject = create_subj() 

156 subject["value"] = subj 

157 subject["type"] = "msc" 

158 subject["lang"] = "en" 

159 xarticle.kwds.append(subject) 

160 

161 # PAGES 

162 pages = soup.find("span", {"class": "item-pp"}) 

163 if pages: 163 ↛ 181 (line 163 didn't jump to line 181 because the condition on line 163 was always true)

164 pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text()) 
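# Matches page ranges such as "123-145" or "(1)-(10)"; surrounding parentheses are stripped below.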

165 if pages_to: 165 ↛ 181 (line 165 didn't jump to line 181 because the condition on line 165 was always true)

166 parts = pages_to[0].split("-") 

167 first_page = parts[0].replace("(", "").replace(")", "") 

168 if len(parts) > 1: 168 ↛ 172 (line 168 didn't jump to line 172 because the condition on line 168 was always true)

169 last_page = parts[1].replace("(", "").replace(")", "") 

170 xarticle.lpage = last_page 

171 

172 xarticle.fpage = first_page 

173 

174 # Biblio 

175 # bibitems_tags = soup.select("div.references-inside div.reference") 

176 # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags] 

177 # if len(bibitems) > 0: 

178 # xarticle.abstracts.append(self.create_bibliography(bibitems)) 

179 

180 # DOI 

181 reg_doi = re.compile("dx.doi.org") 

182 

183 what: list[CitationLiteral] = [ 

184 "lang", 

185 "title", 

186 "author", 

187 "pdf", 

188 "abstract", 

189 "page", 

190 "mr", 

191 "zbl", 

192 "publisher", 

193 "keywords", 

194 ] 
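# Fields requested from get_metadata_using_citation_meta below, which reads the page's citation_* <meta> tags.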

195 self.get_metadata_using_citation_meta(xarticle, xissue, soup, what) 

196 

197 if bloc_ref_ids and isinstance(bloc_ref_ids, Tag): 197 ↛ 216 (line 197 didn't jump to line 216 because the condition on line 197 was always true)

198 doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))] 

199 if len(doi_node) > 0: 199 ↛ 216 (line 199 didn't jump to line 216 because the condition on line 199 was always true)

200 doi = doi_node[0].get_text() 

201 pos = doi.find("10.") 

202 if pos > 0: 202 ↛ 203 (line 202 didn't jump to line 203 because the condition on line 202 was never true)

203 doi = doi[pos:] 

204 xarticle.doi = doi 

205 

206 # fix wrong doi attribution for article a14 of volume 62 number 1 

207 # 10.1007/s10587-012-0005-x: 

208 if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]: 208 ↛ 209 (line 208 didn't jump to line 209 because the condition on line 208 was never true)

209 xarticle.doi = None 

210 else: 

211 xarticle.pid = ( 

212 doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_") 

213 ) 
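# e.g. "10.1007/s10587-012-0005-x" becomes the pid "10_1007_s10587_012_0005_x"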

214 

215 # Hack to handle articles with no titles 

216 if not xarticle.title_tex: 216 ↛ 217 (line 216 didn't jump to line 217 because the condition on line 216 was never true)

217 xarticle.title_tex = " " 

218 

219 return xarticle