Coverage for src/crawler/by_source/seio_crawler.py: 77%

113 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
    ).build()
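    # Illustrative sketch (not part of the crawled site data): the heading strings below
    # are assumed examples of SEIO issue toggles, shown only to document what the two
    # regexes are meant to capture.
    #   >>> import regex
    #   >>> regex.search(SeioCrawler.issue_re, "Número 1, 2 / Marzo 2023").groupdict()
    #   {'number': '1, 2', 'year': '2023'}
    #   >>> regex.search(SeioCrawler.volume_re, "Volumen 39").groupdict()
    #   {'volume': '39'}
    # The crawler then normalises a combined number such as "1, 2" to "1-2".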


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:  # partial branch: condition never true
                raise ValueError("Couldn't find volume")

            volume_dict = regex_to_dict(
                self.volume_re,
                volume_tag.text,
                error_msg="Couldn't parse volume",
            )
            if volume_dict["volume"] is None:  # partial branch: condition never true
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):  # partial branch: condition never true
                raise ValueError("Couldn't parse issue href")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues
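        # A minimal sketch of the collection-page markup this method assumes (the exact
        # SEIO/Divi structure may differ; this fragment is illustrative only):
        #
        #   <div class="et_pb_toggle">
        #     <h5 class="et_pb_toggle_title">Volumen 39</h5>
        #     <div class="et_pb_toggle_content">
        #       <a href="/beio/numero-1-2023/">Número 1 / Marzo 2023</a>
        #       <a href="/beio/vol39-num2.pdf">Número 2 / Julio 2023</a>
        #     </div>
        #   </div>
        #
        # Each anchor becomes one xissue; a link that points straight at a PDF is handled
        # by crawl_issue below.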


    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)
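        # Hedged note: when an issue's link points directly at a PDF (e.g. a path such as
        # ".../algun-numero.pdf", assumed here for illustration), the file is attached to
        # the issue itself and xissue.url is cleared, presumably so the base crawler does
        # not try to fetch it as an HTML issue page.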


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):  # partial branch: condition never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)
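        # A minimal sketch of the issue-page markup this method assumes (illustrative,
        # not copied from the site):
        #
        #   <h2 class="entry-title"><a href="/beio/some-article/">Article title</a></h2>
        #
        # One xarticle (pid "a0", "a1", ...) is created per matching anchor.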


    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:  # partial branch: condition never true
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:  # partial branch: condition always true
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:  # partial branch: condition always true
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))
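            # Hedged aside: detect_language is not defined in this module; the class-level
            # lingua detector above can also be exercised directly, e.g.
            #   language_detector.detect_language_of("El análisis estadístico de los datos")
            # returns lingua.Language.SPANISH (sample sentence assumed, for illustration only).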


        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We try to pick up as many as we can, but some layouts are not straightforward
        # to parse; those cases are currently ignored, so no author appears on Geodesic
        # for them (see the markup sketch after the loop below).
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags or len(author_tags) == 0:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")

            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:  # partial branch: condition never true
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:  # partial branch: condition always true
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):  # partial branch: condition never true
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:  # partial branch: condition never true
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):  # partial branch: condition always true
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:  # partial branch: condition never true
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

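        # Illustrative sketch of the two author layouts handled above (assumed shapes,
        # not copied from the site):
        #
        #   <div class="author-info"><div class="author">
        #     <p><strong>Jane Doe</strong><br/>
        #        <a href="https://orcid.org/0000-0000-0000-0000">ORCID</a><br/>
        #        Universidad de Ejemplo</p>
        #   </div></div>
        #
        #   <p class="author"><span><code>Jane Doe</code></span><br/>Universidad de Ejemplo</p>
        #
        # Layouts that match neither selector are skipped, so those articles end up
        # without contributors.
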

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:  # partial branch: condition never true
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):  # partial branch: condition never true
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
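

# Quick, assumed illustration of the :-soup-contains() selector used above (the HTML
# fragment is made up; it only documents the expected shape of the download button):
#   >>> from bs4 import BeautifulSoup
#   >>> frag = BeautifulSoup(
#   ...     '<div id="main-content"><a class="et_pb_button" href="/doc.pdf">Ver PDF</a></div>',
#   ...     "html.parser",
#   ... )
#   >>> frag.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")["href"]
#   '/doc.pdf'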