Coverage for src/crawler/by_source/dml_e_crawler.py: 28%

176 statements  

coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

import regex
import requests
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf, article_has_source
from crawler.models.extlink_checked import ExtlinkChecked
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar: there is no issue page, and articles are grouped by
    "year" instead of by volume/issue. The volume/issue number is stored inside
    each article page, so in order to parse volume and issue numbers we must
    parse the articles before creating volumes and issues.
    """
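    # Crawl flow, as implemented below: parse_collection_content walks the
    # pagination, parse_collection_page and parse_issue_tag build one
    # provisional issue per year, then crawl_issue re-buckets the parsed
    # articles into the volume/issue pairs extracted by
    # parse_dml_e_article_content (via crawl_dml_e_article).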

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Examples of "Publicación" strings that issue_regex must handle:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
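    # For example, on "2000, 51 (1): 49-58, 13 Ref." the first match yields
    # volume="51", number="1", page_start="49", page_end="58".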

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = False
        issues_tags = soup.select("a[name], ul.art_info")
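        # The selector returns year anchors (a[name]) and article lists
        # (ul.art_info) interleaved in document order, so each anchor sets the
        # year used for the article lists that follow it.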

        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
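        # The article page is a two-column table: the <th> cell carries a Spanish
        # label ("Título ...", "Autor/es", "Publicación", "Idioma") and the <td>
        # cell carries the value; the row without a <th> holds the PDF link.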

        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title
            if header_tag.text == "Título español":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título original":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título inglés":
                xarticle.title_tex = value_tag.text
                continue

            # Authors
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue
            # Publication (volume, number, page range)
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    # raise ValueError("Cannot parse Article page")
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    if "volume" in volume_data:
                        issue_volume = volume_data["volume"]
                    if "number" in volume_data:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # Language
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles
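        # Articles are re-bucketed into one issue per (volume, number) pair
        # parsed from each article page; when neither is present, the year is
        # used as the bucket key.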

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if article_has_pdf(a)]

            if not self.dry and len(value.articles) > 0:
                self.process_resource_metadata(xissue, resource_type="issue")
                self.add_xissue_into_database(value)

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a URL")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number

    @classmethod
    def check_pdf_link_validity(cls, url: str, verify: bool):
        # We override this base_crawler method to handle PDF links that do not
        # point to the article PDF.
        # Avoid downloading the whole PDF.
        CHUNK_SIZE = 100  # number of bytes fetched
        # If the url contains "Movingwall", it does not lead to the article.
        if "Movingwall" in url:
            print("The url does not link to the PDF article because of a moving wall")
            return (
                False,
                "No query sent",
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "The url does not link to the PDF article because of a moving wall",
                },
            )

        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
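        # The Range header asks the server for only the first CHUNK_SIZE bytes;
        # servers that honour it reply with 206 Partial Content, which is enough
        # to read the "%PDF-x.y" magic bytes without downloading the whole file.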

        with requests.get(
            url, stream=True, allow_redirects=True, headers=header, verify=verify
        ) as response:
            content_type = response.headers.get("Content-Type", "")
            if "application/pdf" not in content_type:
                # Content type is wrong, let's check the PDF header instead
                try:
                    pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                    if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                        return (
                            False,
                            response,
                            {
                                "status": ExtlinkChecked.Status.ERROR,
                                "message": f"Content-Type header: {content_type}; PDF header not found: got {pdf_header}",
                            },
                        )
                    else:
                        return (
                            True,
                            response,
                            {
                                "status": ExtlinkChecked.Status.WARNING,
                                "message": f"Content-Type header: {content_type}",
                            },
                        )
                except StopIteration:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"Content-Type header: {content_type}.",
                        },
                    )
            try:
                pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"PDF header not found: got {pdf_header}",
                        },
                    )
            except StopIteration:
                return (
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": f"Content-Type header: {content_type}.",
                    },
                )

            if response.status_code not in (200, 206):
                raise ValueError("Invalid status code")

            return (
                True,
                response,
                {
                    "status": ExtlinkChecked.Status.OK,
                    "message": "",
                },
            )