Coverage for src/crawler/by_source/seio_crawler.py: 76%
113 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


# STAT: currently only supports BEIO.
# Issue and volume number parsing needs to be adapted for other collections.
class SeioCrawler(BaseCollectionCrawler):
    source_name = "Sociedad de Estadistica e Investigación Operativa"
    source_domain = "SEIO"
    source_website = "https://www.seio.es/"

    issue_re = r"(Número|Number) (?P<number>[\d ,]+) \/ [\w, ]+ (?P<year>\d{4})"
    volume_re = r"Volumen? (?P<volume>\d+)"
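    # Illustrative header shapes these patterns expect (inferred from the patterns
    # themselves, not verified against the site), e.g. "Número 1, 2 / Enero, Junio 2023"
    # and "Volumen 39"; see the __main__ sanity check at the end of the file.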
    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        issues_tags = soup.select(".et_pb_toggle a")
        for i in issues_tags:
            issue_search = regex.search(self.issue_re, i.text)
            if not issue_search:
                print(f"Couldn't parse issue data from string: {i.text}")
                continue
            issue_data = issue_search.groupdict()

            parent_tag = i.find_parent(class_="et_pb_toggle")
            volume_tag = parent_tag.select_one(".et_pb_toggle_title")
            if not volume_tag:
                raise ValueError("Couldn't find volume")
            volume_search = regex.search(self.volume_re, volume_tag.text)
            if not volume_search:
                raise ValueError("Couldn't parse volume")
            volume_dict = volume_search.groupdict()
            if volume_dict["volume"] is None:
                raise ValueError("Couldn't parse volume text")

            url = i.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't parse issue href")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, url),
                    issue_data["year"],
                    volume_dict["volume"],
                    issue_data["number"].replace(", ", "-"),
                )
            )
        return xissues

    def crawl_issue(self, xissue):
        if not xissue.url:
            raise ValueError("Error while parsing issue: issue url must be set")
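        # Some issues link directly to a PDF instead of an HTML table of contents.
        # In that case the PDF is attached to the issue itself and its url is cleared,
        # presumably so the base crawler does not try to fetch and parse it as HTML.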
        if xissue.url.endswith(".pdf"):
            add_pdf_link_to_xarticle(xissue, xissue.url)
            xissue.url = None
        return super().crawl_issue(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("h2.entry-title a")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            url = article_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        title_tag = soup.select_one("div#main-content h1.br--text")
        if title_tag is None:
            raise ValueError("Couldn't parse title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        keywords_tag = soup.select_one("div#main-content .keywords")
        if keywords_tag:
            keyword_header = keywords_tag.select_one("strong")
            if keyword_header:
                keyword_header.decompose()

            kwd_text = cleanup_str(keywords_tag.text)

            for kwd_str in kwd_text.split(", "):
                xarticle.kwds.append(create_subj(value=kwd_str))
            keywords_tag.decompose()

        # WARN: some abstracts have multiple paragraphs and/or lists
        # https://www.seio.es/beio/grupo-de-tratamiento-y-analisis-de-big-data-tabida/
        abstract_tag = soup.select_one("div#main-content .abstract-section")
        if abstract_tag:
            abstract_header = abstract_tag.select_one("h3")
            if abstract_header:
                abstract_header.decompose()
            abstract_str = abstract_tag.text
            xarticle.lang = self.detect_language(abstract_str)
            xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_str))

        # Unfortunately, SEIO authors are formatted differently depending on the issue.
        # We try to recover as many as we can, but it is sometimes not straightforward;
        # the remaining cases are currently ignored, so no author will appear on Geodesic.
        author_tags = soup.select("div#main-content .author-info .author > p")
        if not author_tags:
            author_tags = soup.select("div#main-content p.author")
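        # Roughly, the two layouts targeted above look like the following
        # (reconstructed from the selectors, not verified against the site):
        #   <div class="author-info"><div class="author"><p><strong>Name</strong> Affiliation</p></div></div>
        #   <p class="author"><span><code>Name</code></span> Affiliation</p>
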
        for author_tag in author_tags:
            contributor = create_contributor(role="author")

            author_name = (
                author_tag.select_one("strong")
                or author_tag.select_one("span:first-child > code")
                or author_tag.select_one("code:first-child")
            )
            if not author_name:
                print(f"Couldn't find author name: {url}")
                continue

            contributor["string_name"] = author_name.text
            author_name.decompose()

            orcid_tag = author_tag.select_one("a[href^='https://orcid.org']")
            if orcid_tag:
                orcid_url = orcid_tag.get("href")
                if not isinstance(orcid_url, str):
                    raise ValueError("Found orcid tag, but couldn't parse href")

                orcid_search = regex.search(self.orcid_re, orcid_url)
                if not orcid_search:
                    raise ValueError("Found orcid url, but couldn't parse it.")
                contributor["orcid"] = orcid_search.groupdict()["orcid"]
                orcid_tag.decompose()

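            # The first remaining child of the paragraph (after the name and the ORCID
            # link have been removed) that is not a line break or a stray dot is treated
            # as the affiliation.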
            affiliation = next(
                (e.text for e in author_tag if str(e.text) not in ("<br/>", "\n", ".")), None
            )
            if isinstance(affiliation, str):
                affiliation = cleanup_str(affiliation)
                if len(affiliation) > 0:
                    contributor["addresses"].append(cleanup_str(affiliation))

            xarticle.contributors.append(contributor)

        pdf_button = soup.select_one("#main-content a.et_pb_button:-soup-contains('PDF')")
        if not pdf_button:
            raise ValueError("Couldn't find PDF link")
        pdf_url = pdf_button.get("href")
        if not isinstance(pdf_url, str):
            raise ValueError("Couldn't parse pdf url")
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return super().parse_article_content(content, xissue, xarticle, url, pid)
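
if __name__ == "__main__":
    # Minimal sanity check of the issue/volume patterns, using illustrative header
    # strings (assumed shapes, not taken from the live site).
    sample_issue = "Número 1, 2 / Enero, Junio 2023"
    sample_volume = "Volumen 39"
    issue_match = regex.search(SeioCrawler.issue_re, sample_issue)
    volume_match = regex.search(SeioCrawler.volume_re, sample_volume)
    print(issue_match.groupdict() if issue_match else "issue_re did not match")
    # expected: {'number': '1, 2', 'year': '2023'}
    print(volume_match.groupdict() if volume_match else "volume_re did not match")
    # expected: {'volume': '39'}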