Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 79%
109 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import re
2import typing
3from datetime import datetime
5from bs4 import BeautifulSoup, Tag
6from ptf.model_data import (
7 IssueData,
8 create_articledata,
9 create_contributor,
10 create_extlink,
11 create_issuedata,
12)
14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
16if typing.TYPE_CHECKING: 16 ↛ 17line 16 didn't jump to line 17 because the condition on line 16 was never true
17 from ..nsjom_crawler import NsjomCrawler
19source_domain = "NSJOM"
def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """Collect all NSJOM issues published on the year-specific pages.

    Fetches https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html for every year
    from 2010 to 2014 (inclusive, capped at the current year) and parses
    each page into issues. Returns the concatenated list of IssueData.
    """
    first_year = 2010
    last_year = min(2014, datetime.now().year)

    xissues: list[IssueData] = []
    for year in range(first_year, last_year + 1):
        year_url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        page_content = self.download_file(year_url)
        xissues.extend(parse_year(self, page_content, year, year_url))

    return xissues
def is_heading(element: Tag):
    """Return True if *element* marks the start of an issue section.

    An element counts as a heading when it contains a descendant matching
    the ``.HeadingNSJOM`` selector, or when it carries the ``HeadingNSJOM``
    class itself. BeautifulSoup may expose the ``class`` attribute either
    as a list of class names or as a plain string; both forms are handled.
    """
    if element.select_one(".HeadingNSJOM"):
        return True

    css_classes = element.get("class")
    if not css_classes:
        return False

    # Plain-string attribute value: exact match only.
    if isinstance(css_classes, str):
        return css_classes == "HeadingNSJOM"

    # Usual bs4 representation: list of class names.
    return "HeadingNSJOM" in css_classes
def parse_issue_content(self, content: str, xissue: IssueData):
    """Parse a single issue from its year page.

    Delegates to parse_year, restricting parsing to this issue's pid.
    Raises ValueError if the issue has no year set.
    """
    year = xissue.year
    if not year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(year), xissue.url, xissue.pid)
def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    """Parse one year page (https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html).

    The page is a flat sequence of heading paragraphs and article tags;
    each heading starts a new issue, and the tags that follow it belong to
    that issue. Returns the list of parsed IssueData (empty if none).

    :param content: raw HTML of the year page
    :param year: publication year of the page
    :param url: source url recorded on each parsed issue
    :param pid_to_crawl: when set, only the issue with this pid is parsed
    """
    soup = BeautifulSoup(content, "html.parser")
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")

    # Group the flat element list into per-issue buckets: a heading opens a
    # new bucket, every following element is appended to the current one.
    issues: list[list[Tag]] = []
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        # Skip non-breaking-space filler paragraphs.
        if current_element.text == "\xa0":
            continue
        if not issues:
            # Stray element before the first heading: there is no issue to
            # attach it to. Previously this raised IndexError on issues[-1].
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            # parse_issue_tags returns None when pid_to_crawl doesn't match.
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues
def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    """Build an IssueData from the tag group of one issue.

    *tags* starts with the issue heading (e.g. "NSJOM Vol. 40, No. 2"),
    followed by alternating (metadata, title-link) tag pairs, one pair per
    article. The list is consumed destructively via pop(0).

    Returns None when *pid_to_crawl* is set and does not match this issue's
    pid; raises ValueError when the volume number cannot be parsed.
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)
    # First tag is the issue heading; the "No." part is optional in the regex.
    issue_title_tag = tags.pop(0)
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    # NOTE(review): when the heading has no issue number, issue_number is None
    # and the pid embeds the literal string "None" — confirm this is the
    # intended pid format before changing it (pid_to_crawl matches against it).
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return

    # Remaining tags come in (metadata, title-link) pairs, one per article.
    while len(tags) > 0:
        article_meta = tags.pop(0)
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":
            continue
        article_title = tags.pop(0)
        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),
            f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        )
        if article is None:
            # Article without a pdf link is skipped (logged by parse_article).
            continue
        xissue.articles.append(article)
    return xissue
def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    """Create an ArticleData from an article's metadata line and title link.

    *meta_text* looks like "d / d / d / PAGES: AUTHOR[, AUTHOR][ and AUTHOR]";
    *title_tag* is the <a> element whose href points at the pdf and whose
    text is the article title.

    Returns None when the title link has no href (no pdf available).
    Raises ValueError when pages/authors cannot be parsed or the href
    resolves to a list.
    """
    xarticle = create_articledata()

    if source:
        xarticle.ext_links.append(
            create_extlink(rel="source", location=source, metadata=self.source_domain)
        )

    meta_match = re.search(
        r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text
    )
    if meta_match is None:
        raise ValueError("Cannot parse authors or page number")
    self.set_pages(xarticle, meta_match.group("Pages"))

    # Split the author string on "," and " and " separators.
    author_names = re.findall(
        r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", meta_match.group("Authors")
    )
    for author_name in author_names:
        xarticle.contributors.append(
            create_contributor(role="author", string_name=author_name)
        )

    href = title_tag.get("href")
    if href is None:
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(href, list):
        raise ValueError("Article link is a list")
    add_pdf_link_to_xarticle(xarticle, self.source_website + href)

    xarticle.title_tex = cleanup_str(title_tag.text)
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle