Coverage for src/crawler/by_source/jgaa_crawler.py: 86%
62 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str

# https://jgaa.info/index.php/jgaa/oai?verb=ListRecords&metadataPrefix=oai_dc
class JgaaCrawler(BaseCollectionCrawler):
    source_name = "Journal of Graph Algorithms and Applications website"
    source_domain = "JGAA"
    source_website = "https://jgaa.info/"

    issue_re = r"Vol\. (?P<volume>\d+) No\. (?P<number>\d+) \((?P<year>\d+)\)"
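
    # A doctest-style sketch of what issue_re captures; the sample string is
    # hypothetical, modeled on the pattern rather than scraped from jgaa.info:
    #
    #   >>> regex.search(JgaaCrawler.issue_re, "Vol. 28 No. 2 (2024)").groupdict()
    #   {'volume': '28', 'number': '2', 'year': '2024'}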

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("div.obj_issue_summary > h2")
        for issue in issues:
            issue_title_str = ""

            series_title = issue.select_one("a.title")
            if not series_title:  # partial branch: condition never true in tests
                raise ValueError("Couldn't find issue title")
            issue_href = series_title.get("href")
            if not isinstance(issue_href, str):  # partial branch: condition never true in tests
                raise ValueError("Couldn't parse issue url")

            series_tag = issue.select_one("div.series")
            if series_tag:
                issue_title_str = series_title.text
            else:
                series_tag = series_title

            issue_search = regex.search(self.issue_re, series_tag.text)
            if not issue_search:  # partial branch: condition never true in tests
                raise ValueError("Couldn't parse volume year")
            issue_dict = issue_search.groupdict()

            xissue = self.create_xissue(
                issue_href,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict["number"],
            )
            xissue.title_tex = issue_title_str
            xissues.append(xissue)

        paginator = soup.select_one("a.next")
        if paginator:
            next_page_url = paginator.get("href")
            if not isinstance(next_page_url, str):  # partial branch: condition never true in tests
                raise ValueError("Couldn't parse pagination url")
            next_page_content = self.download_file(next_page_url)
            xissues.extend(self.parse_collection_content(next_page_content))

        return xissues
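
    # Markup this method is written against (hypothetical fragment inferred
    # from the selectors above, not copied from jgaa.info):
    #
    #   <div class="obj_issue_summary">
    #     <h2>
    #       <a class="title" href="...">Special Issue ...</a>
    #       <div class="series">Vol. 28 No. 2 (2024)</div>
    #     </h2>
    #   </div>
    #   <a class="next" href="...">Next</a>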

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a[id^='article']")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # partial branch: condition never true in tests
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = article_url
            xissue.articles.append(xarticle)
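
    # The selector above assumes OJS-style issue pages where each article link
    # looks roughly like this hypothetical fragment:
    #
    #   <a id="article-4321" href="https://jgaa.info/.../article/view/4321">...</a>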

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "lang", "pdf", "page", "doi", "title", "keywords"]
        )

        abstract_tag = soup.select_one("section.item.abstract")
        if abstract_tag:  # partial branch: condition always true in tests
            label_tag = abstract_tag.select_one("h2.label")
            if label_tag:  # partial branch: condition always true in tests
                label_tag.decompose()
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_tag.text), lang=xarticle.lang
                )
            )
        return super().parse_article_content(content, xissue, xarticle, url, pid)
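
# A minimal, runnable sketch of the selector + regex pipeline used in
# parse_collection_content, exercised against a hypothetical HTML fragment
# (not fetched from jgaa.info); it relies only on the regex and bs4 imports
# already present in this module.
if __name__ == "__main__":
    sample = """
    <div class="obj_issue_summary">
      <h2>
        <a class="title" href="https://jgaa.info/issue/view/123">Special Issue on Graph Drawing</a>
        <div class="series">Vol. 28 No. 2 (2024)</div>
      </h2>
    </div>
    """
    demo_soup = BeautifulSoup(sample, "html.parser")
    heading = demo_soup.select_one("div.obj_issue_summary > h2")
    assert heading is not None
    # Prefer the series tag for the volume string, as parse_collection_content does.
    series = heading.select_one("div.series") or heading.select_one("a.title")
    assert series is not None
    match = regex.search(JgaaCrawler.issue_re, series.text)
    print(match.groupdict() if match else "no match")
    # -> {'volume': '28', 'number': '2', 'year': '2024'}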