Coverage for src/crawler/by_source/seio_crawler.py: 77%
113 statements

from urllib.parse import urljoin

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadística e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"
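    # Illustrative matches (example strings, not necessarily verbatim from the site):
    #   regex.search(issue_re, "Número 1, 2 / Marzo, Junio 2024") -> number="1, 2", year="2024"
    #   regex.search(volume_re, "Volumen 38") -> volume="38"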

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.SPANISH, lingua.Language.FRENCH
    ).build()
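    # Illustrative: language_detector.detect_language_of("Métodos bayesianos de optimización")
    # should return lingua.Language.SPANISH; detection is presumably invoked via
    # self.detect_language in the base crawler (see parse_article_content below).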

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")

            if not volume_tag:  # coverage: condition was never true
                raise ValueError("Couldn't find volume")

            volume_dict = regex_to_dict(
                self.volume_re,
                volume_tag.text,
                error_msg="Couldn't parse volume",
            )
            if volume_dict["volume"] is None:  # coverage: condition was never true
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):  # coverage: condition was never true
                raise ValueError("Couldn't parse issue href")
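
            # Multi-number issues such as "1, 2" are normalized to "1-2" below
            # (assumption: a comma-separated list on the site denotes a double issue).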
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues

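    # Some issue links point directly at a PDF rather than at an HTML table of contents;
    # in that case the PDF is attached to the issue itself and xissue.url is cleared
    # before delegating to the base crawler.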
    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):  # coverage: condition was never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:  # coverage: condition was never true
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:  # coverage: condition was always true
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
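        # The abstract is stored as plain text (value_tex = abstract_tag.text), so that
        # internal structure is flattened.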
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:  # coverage: condition was always true
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))

        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We can try to find as much as we can, but it's sometimes not straightforward to do so.
        # Those cases are currently ignored, thus no author will appear on Geodesic.
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags or len(author_tags) == 0:
            author_tags = soup.select("div#main-content p.author")

        for author_tag in author_tags:
            contributor = create_contributor(role="author")
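
            # Depending on the issue layout, the author's name may be wrapped in <strong>,
            # in a <code> inside the first <span>, or in a leading <code> tag; the selectors
            # below are tried in that order.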
            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:  # coverage: condition was never true
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:  # coverage: condition was always true
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):  # coverage: condition was never true
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:  # coverage: condition was never true
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()
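
            # The first remaining child of the paragraph whose text is not a line break or a
            # stray "." is treated as the affiliation.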
            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )

            if isinstance(affiliation, str):  # coverage: condition was always true
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:  # coverage: condition was never true
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:  # coverage: condition was never true
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):  # coverage: condition was never true
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
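
# Illustrative only, not part of the module: the non-standard ":-soup-contains()" pseudo-class
# used for the PDF button above is a soupsieve (BeautifulSoup) selector extension, e.g.
#
#   >>> from bs4 import BeautifulSoup
#   >>> html = '<div id="main-content"><a class="et_pb_button" href="/beio.pdf">Ver PDF</a></div>'
#   >>> BeautifulSoup(html, "html.parser").select_one(
#   ...     "#main-content a.et_pb_button:-soup-contains('PDF')"
#   ... )["href"]
#   '/beio.pdf'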