Coverage for src/crawler/by_source/seio_crawler.py: 77%
116 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

from urllib.parse import urljoin

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"
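    # Illustrative examples only (not taken from the live site): issue_re is meant to
    # match link texts such as "Número 1, 2 / Enero, Marzo 2023" or
    # "Number 3 / November 2022", capturing number="1, 2" (or "3") and year="2023"
    # (or "2022"); volume_re matches toggle titles such as "Volumen 38" or
    # "Volume 38", capturing volume="38".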

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
    ).build()
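    # A minimal illustration of how such a lingua detector is typically queried
    # (illustrative only; the actual call site is the base class's detect_language
    # helper, whose internals are not shown in this file):
    #     language_detector.detect_language_of("Resumen del artículo")
    #     # -> lingua.Language.SPANISH (or None if no candidate language fits)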

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:
                raise ValueError("Couldn't find volume")
            volume_search = regex.search(self.volume_re, volume_tag.text)
            if not volume_search:
                raise ValueError("Couldn't parse volume")
            volume_dict = volume_search.groupdict()
            if volume_dict["volume"] is None:
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't parse issue href")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues

    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
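        # Some issues link directly to a single PDF instead of an HTML table of
        # contents. In that case the PDF is attached to the issue itself and its URL
        # is cleared, presumably so that the base crawler does not try to fetch and
        # parse it as an article listing (an assumption about BaseCollectionCrawler,
        # not something stated in this file).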
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))

        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We try to extract as much as we can, but some formats are not straightforward
        # to handle; those cases are currently ignored, so no authors appear on Geodesic
        # for the affected articles.
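        # Hypothetical sketches of the two markups the selectors below expect (not
        # copied from the site, shown only to illustrate the lookup order):
        #   <div class="author-info"><div class="author">
        #     <p><strong>Jane Doe</strong>
        #        <a href="https://orcid.org/0000-0000-0000-0000">ORCID</a>
        #        University of Somewhere</p>
        #   </div></div>
        # or, in other issues:
        #   <p class="author"><code>Jane Doe</code> University of Somewhere</p>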
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")

            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

            # With the name and ORCID link removed above, take the first remaining
            # child of the paragraph whose text is not just a line break or stray
            # punctuation and treat it as the affiliation.
            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:
                    contributor["addresses"].append(affiliation)

            xarticle.contributors.append(contributor)

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle