Coverage for src/crawler/by_source/dmlpl_crawler.py: 76%

164 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    periode_begin = 0
    periode_end = 9999

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    subCrawlers = {LofplCrawler: None}
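    # The value is expected to be filled in with an instance before crawling;
    # crawl_article raises "Crawler incorrectly initialized" if it is still None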

    def parse_collection_content(self, content):
        """
        Parse the collection JSON and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
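            # "tom/rocznik" is Polish for roughly "volume/yearbook"; the entry text is an
            # HTML fragment whose first <a> holds the volume and the second the year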
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
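            # A volume node lists either numbered issues ("numer") or articles ("artykuł") directly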
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):
                has_articles = True

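        # Volumes that list articles directly (without "numer" children) become a single xissue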
        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues

    def parse_issue_content(self, content, xissue):
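        # Each JSON entry is one article of the issue; pids are assigned sequentially (a0, a1, ...)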
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA : manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            url = xarticle.url

            article_source = self.source_domain
            response = self.get(xarticle.url)
            content = self.decode_response(response)
            pid = f"{xissue.pid}_{xarticle.pid}"

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
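                # The rewritten /api URL serves the metadata that LofplCrawler.parse_article_content consumes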
                content = self.download_file(xarticle.url)
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:
                    raise ValueError("Crawler incorrectly initialized")
                targetCrawler.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid  # type: ignore
                )
                article_source = targetCrawler.source_domain
            elif response.url.startswith("http://pldml.icm.edu.pl"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid
                )
            else:
                raise NotImplementedError

            # ARTICLE URL as an ExtLink (to display the link in the article page)
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = url
            ext_link["metadata"] = article_source
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def parse_dmlpl_generic_page(self, content: str):
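        # DMLPL detail pages are label/value rows; map each (Polish) row label to the Tag holding its value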
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:
                sections_dict[row_label.text] = tag

        return sections_dict

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        sections_dict = self.parse_dmlpl_generic_page(content)
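        # Row labels are Polish: "Tytuł artykułu" = article title, "Autorzy" = authors,
        # "Treść / Zawartość" = content, "Języki publikacji" = publication languages,
        # "Abstrakty" = abstracts, "Słowa kluczowe" = keywords, "Strony" = pages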

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
        xarticle.pid = pid

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"[{pid}] PDF not found")

        # Lang
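        # "Języki publikacji" may list several codes; only the known "pl fr" case is patched below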
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":
                xarticle.lang = "pl"
                print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                xabstract = create_abstract(
                    tag="abstract", value_tex=cleanup_str(text_div.text), lang=lang
                )
                xarticle.abstracts.append(xabstract)

        # Keywords
        if "Słowa kluczowe" in sections_dict:
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keyword_row in keywords_lists:
                lang = "und"
                lang_div = keyword_row.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keyword_row.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
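        # "Nazwisko" = surname, "Imię" = first name; fall back to the full "Twórca" (creator) string if either is empty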
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author