Coverage for src / crawler / by_source / arsia_crawler.py: 13%
82 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import re
2from urllib.parse import urljoin
4from bs4 import BeautifulSoup
5from ptf.model_data import create_abstract, create_articledata, create_contributor
7from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
8from crawler.utils import add_pdf_link_to_xarticle
class ArsiaCrawler(MatchingCrawler):
    """Crawler for the "Ars Inveniendi Analytica website" source.

    The issue list, the article list of each issue and the metadata of each
    article are all scraped from the journal's HTML pages with BeautifulSoup.
    """

    source_name = "Ars Inveniendi Analytica website"
    source_domain = "ARSIA"
    source_website = "https://ars-ojs-utexas.tdl.org/ars/"
    # Removes line breaks and tabs from text scraped out of the HTML.
    regex = re.compile(r"[\n\r\t]")

    @staticmethod
    def _doi_from_href(href):
        """Return the last two "/"-separated segments of ``href`` joined by "/"."""
        return "/".join(href.split("/")[-2:])

    def parse_collection_content(self, content):
        """
        Build the list of xissues from the archive page of the journal.

        One xissue is created for every link matched by
        ``ul.issues_archive li a.title``:
          - its URL is the link ``href`` resolved against ``source_website``,
          - its year is the link text with line breaks and tabs removed,
          - its volume number counts down from the number of links to 1, in
            page order (presumably the archive lists the newest issue first
            - TODO confirm),
          - its issue number is always "1".

        :param content: HTML of the archive page
        :return: list of the objects returned by ``self.create_xissue``
        """
        soup = BeautifulSoup(content, "html.parser")

        issue_links = soup.select("ul.issues_archive li a.title")
        issue_count = len(issue_links)
        xissues = []
        for position, issue_link in enumerate(issue_links):
            year = self.regex.sub("", issue_link.text)
            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_link.get("href")),
                    year,
                    # First link gets the highest number, last link gets 1.
                    volume_number=str(issue_count - position),
                    issue_number="1",
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """
        Append one article stub to ``xissue.articles`` per article summary.

        Each stub only carries the article URL and a placeholder ``doi``: the
        articles are numbered from the number of summaries down to 1, in page
        order.

        :param content: HTML of the issue page
        :param xissue: issue whose ``articles`` list is filled in place
        :raises ValueError: if an article summary has no title link
        """
        soup = BeautifulSoup(content, "html.parser")
        summaries = soup.select("ul.cmp_article_list li div.obj_article_summary")

        article_count = len(summaries)
        for position, summary in enumerate(summaries):
            title_link = summary.select_one("h3.title a")
            if title_link is None:
                # Explicit error instead of an AttributeError on None.
                raise ValueError(f"Couldn't parse article url in issue {xissue.pid}")

            xarticle = create_articledata()
            xarticle.url = title_link.get("href")
            xarticle.doi = str(article_count - position)  # for article without doi
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and fill in ``xarticle``.

        The title, authors, PDF link, abstract, DOI and keywords are read from
        the page body. The ``<p>`` elements of the abstract section are used
        positionally: the first holds the PDF link, the second the abstract
        text and the third, when present, the DOI link.

        :param content: HTML of the article page
        :param xissue: issue the article belongs to (its ``pid`` is used to
            build a fallback article pid when there is no DOI paragraph)
        :param xarticle: article stub to complete
        :param url: URL of the article page, stored in ``xarticle.url``
        :return: the completed ``xarticle``
        :raises ValueError: if the title, authors, abstract section, PDF link,
            abstract, DOI link target or keywords cannot be extracted
        """
        soup = BeautifulSoup(content, "html.parser")

        # NOTE(review): the list presumably names the citation_* items the
        # helper should collect - confirm against the base crawler.
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        # Title
        title_tag = soup.select_one("div.page_article article.obj_article_details h1.page_title")
        if not title_tag:
            raise ValueError(f"Couldn't parse title for article {xarticle.url}")
        title = title_tag.text

        # Authors
        authors_tag = soup.select(
            "div.page_article article.obj_article_details section.authors ul.authors li span.name"
        )
        if not authors_tag:
            raise ValueError(f"Couldn't parse authors for article {xarticle.url}")

        # Paragraphs of the abstract section (PDF link, abstract, DOI link)
        abstract_root = soup.select_one("div.main_entry section.abstract")
        paragraphs = abstract_root.find_all("p") if abstract_root is not None else []
        if not paragraphs:
            raise ValueError(f"Couldn't parse metadata_section for article {xarticle.url}")

        # PDF
        pdf_link = paragraphs[0].find("a")
        pdf_url = pdf_link.get("href") if pdf_link is not None else None
        if not pdf_url:
            raise ValueError(f"Couldn't parse pdf url for article {xarticle.url}")

        # Abstract
        if len(paragraphs) < 2:
            raise ValueError(f"Couldn't parse abstract for article {xarticle.url}")
        abstract = paragraphs[1].text

        # DOI
        if len(paragraphs) < 3:
            # No DOI paragraph: derive the pid from the current doi value
            # (the placeholder number unless it was replaced meanwhile) and
            # clear the doi.
            print(f"Couldn't parse DOI for article {xarticle.url}")
            xarticle.pid = xissue.pid + "_" + xarticle.doi
            xarticle.doi = None
        else:
            doi_link = paragraphs[2].find("a")
            if doi_link is not None:
                doi_href = doi_link.get("href")
                if not doi_href or len(doi_href.split("/")) < 2:
                    raise ValueError(f"Couldn't parse DOI for article {xarticle.url}")
                xarticle.doi = self._doi_from_href(doi_href)
            else:
                # The third paragraph has no link: scan every paragraph after
                # the first one and keep the DOI of the last paragraph that
                # contains a link with an href. If none does, the doi is left
                # untouched.
                for paragraph in paragraphs[1:]:
                    candidate = paragraph.find("a", href=True)
                    if candidate is not None:
                        xarticle.doi = self._doi_from_href(candidate.get("href"))

        # Keywords
        keywords = soup.select_one("div.main_entry section.keywords span")
        if not keywords:
            raise ValueError(f"Couldn't parse keywords for article {xarticle.url}")
        keywords = self.regex.sub("", keywords.text).split(", ")

        # Update xarticle
        xarticle.lang = "en"
        xarticle.abstracts.append(create_abstract(lang=xarticle.lang, value_tex=abstract))
        xarticle.url = url
        xarticle.kwds = [
            {"type": "", "lang": xarticle.lang, "value": keyword} for keyword in keywords
        ]
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        xarticle.title_tex = title
        for contributor in authors_tag:
            xarticle.contributors.append(
                create_contributor(role="author", string_name=contributor.text)
            )

        return xarticle