Coverage for src / crawler / by_source / arsia_crawler.py: 13%

82 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import re 

2from urllib.parse import urljoin 

3 

4from bs4 import BeautifulSoup 

5from ptf.model_data import create_abstract, create_articledata, create_contributor 

6 

7from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

8from crawler.utils import add_pdf_link_to_xarticle 

9 

10 

class ArsiaCrawler(MatchingCrawler):
    """Crawler for the Ars Inveniendi Analytica journal website."""

    source_name = "Ars Inveniendi Analytica website"
    source_domain = "ARSIA"
    source_website = "https://ars-ojs-utexas.tdl.org/ars/"
    # Removes newlines, carriage returns and tabs from scraped text.
    regex = re.compile(r"[\n\r\t]")

    def parse_collection_content(self, content):
        """
        Parse the issue archive page and return one xissue per listed issue.

        Every ``ul.issues_archive li a.title`` link becomes an xissue:
        - the link text (newlines/tabs removed) is passed as the year,
        - the link target is resolved against ``source_website``,
        - the volume number is assigned by position: the first link gets the
          total number of links, each following link gets one less,
        - the issue number is always "1".
        """
        soup = BeautifulSoup(content, "html.parser")

        issue_links = soup.select("ul.issues_archive li a.title")
        total = len(issue_links)
        xissues = []
        for offset, link in enumerate(issue_links):
            year = self.regex.sub("", link.text)
            url = link.get("href")
            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, url),
                    year,
                    volume_number=str(total - offset),
                    issue_number="1",
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """
        Append one article per summary found on the issue page to ``xissue.articles``.

        Each article receives the URL of its title link and a placeholder
        ``doi`` (its countdown position in the list, as a string).
        ``parse_article_content`` uses that placeholder to build a pid when
        the article page does not expose a DOI.
        """
        soup = BeautifulSoup(content, "html.parser")
        summaries = soup.select("ul.cmp_article_list li div.obj_article_summary")

        total = len(summaries)
        for offset, summary in enumerate(summaries):
            xarticle = create_articledata()
            xarticle.url = summary.select_one("h3.title a").get("href")
            xarticle.doi = str(total - offset)  # for article without doi
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and return the filled ArticleData.

        The ``section.abstract`` paragraphs are read positionally: the first
        holds the PDF link, the second the abstract text and the third (when
        present) the DOI link.

        Raises:
            ValueError: if the title, authors, abstract section, PDF link,
                abstract paragraph, DOI link target or keywords cannot be parsed.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Inherited helper; presumably fills xarticle from the page's
        # citation_* <meta> tags for the listed fields (confirm in MatchingCrawler).
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        # Title
        title_tag = soup.select_one("div.page_article article.obj_article_details h1.page_title")
        if not title_tag:
            raise ValueError(f"Couldn't parse title for article {xarticle.url}")
        title = title_tag.text

        # Authors
        authors_tag = soup.select(
            "div.page_article article.obj_article_details section.authors ul.authors li span.name"
        )
        if not authors_tag:
            raise ValueError(f"Couldn't parse authors for article {xarticle.url}")

        # Abstract section: a missing section must end in the ValueError below,
        # not in an AttributeError on None.
        abstract_tag = soup.select_one("div.main_entry section.abstract")
        abstract_section = abstract_tag.find_all("p") if abstract_tag is not None else []
        if not abstract_section:
            raise ValueError(f"Couldn't parse metadata_section for article {xarticle.url}")

        # PDF
        pdf_link = abstract_section[0].find("a")
        pdf_url = pdf_link.get("href") if pdf_link is not None else None
        if not pdf_url:
            raise ValueError(f"Couldn't parse pdf url for article {xarticle.url}")

        # Abstract: check the paragraph count explicitly, since a Tag is always
        # truthy and indexing a short list would raise IndexError.
        if len(abstract_section) < 2:
            raise ValueError(f"Couldn't parse abstract for article {xarticle.url}")
        abstract = abstract_section[1].text

        # DOI
        if len(abstract_section) < 3:
            # No DOI paragraph: build the pid from the placeholder set in
            # parse_issue_content, then clear the fake DOI.
            print(f"Couldn't parse DOI for article {xarticle.url}")
            xarticle.pid = xissue.pid + "_" + xarticle.doi
            xarticle.doi = None
        else:
            doi_link = abstract_section[2].find("a")
            if doi_link is not None:
                doi = doi_link.get("href")
                if not doi or len(doi.split("/")) < 2:
                    raise ValueError(f"Couldn't parse DOI for article {xarticle.url}")
                # Keep only the last two path segments of the link target.
                xarticle.doi = "/".join(doi.split("/")[-2:])
            else:
                # The third paragraph has no link: scan every paragraph after the
                # first for a link carrying an href. The last match wins.
                for paragraph in abstract_section[1:]:
                    link = paragraph.find("a", href=True)
                    if link is not None:
                        xarticle.doi = "/".join(link.get("href").split("/")[-2:])

        # Keywords
        keywords = soup.select_one("div.main_entry section.keywords span")
        if not keywords:
            raise ValueError(f"Couldn't parse keywords for article {xarticle.url}")
        keywords = self.regex.sub("", keywords.text).split(", ")

        # Update xarticle
        xarticle.lang = "en"
        xarticle.abstracts.append(create_abstract(lang=xarticle.lang, value_tex=abstract))
        xarticle.url = url
        xarticle.kwds = [
            {"type": "", "lang": xarticle.lang, "value": keyword} for keyword in keywords
        ]
        add_pdf_link_to_xarticle(xarticle, pdf_url)
        xarticle.title_tex = title
        for contributor in authors_tag:
            xarticle.contributors.append(
                create_contributor(role="author", string_name=contributor.text)
            )

        return xarticle