Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 83%
98 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.utils import cleanup_str

if typing.TYPE_CHECKING:  # coverage: never true at runtime (type-checking only)
    from ..nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
31 """
32 Parses all articles from year-specific webpages : https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
33 From 1971 to 2009 (included)
34 """
    xissues: list[IssueData] = []
    year_start = max(1971, periode_start)
    year_end = min(2009, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.get_page_content(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach the source and year to the propagated error
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues


def parse_issue_content(self, content: str, xissue: IssueData):
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
62 """Parses one page.
63 eg : https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
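    # Illustrative page shape (hypothetical markup): each issue on a year page sits
    # in its own top-level table, e.g.
    #   <body> <table>...Vol 39. No. 1...</table> <table>...Vol 39. No. 2...</table> </body>
    # so the "body>table" selector below yields one tag per issue.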
    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        # Forward pid_to_parse so parse_issue_content can target a single issue
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:  # coverage: branch never taken in the measured run
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues


def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
79 """Parses one issue tag.
80 eg: `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
81 """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
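    # Illustrative (sample strings not from the source): the header regex matches
    # titles such as "NSJOM Vol 39. No. 2" (Volume="39", Issue="2") or a bare
    # "NSJOM Vol 5.", in which case the Issue group is None.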
    # No volume header found: possibly an issue summary table rather than an issue
    if match is None:  # coverage: branch never taken
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: condition always true
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"
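    # Illustrative pid: "NSJOM_2009_39_2". When no issue number was matched, the
    # f-string renders the group as "None", e.g. "NSJOM_1971_1_None" (hypothetical).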

    if pid_to_parse and xissue.pid != pid_to_parse:  # coverage: branch never taken
        return
    table_lines.pop(0)  # skip the table header row
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue


def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
133 """Parses one article tag.
134 eg: `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
135 """
    xarticle = create_articledata()
    if source:  # coverage: condition always true
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:  # coverage: branch never taken
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
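    # Illustrative (hypothetical names): a cell "<td>B. Jones<br/>S. Smith</td>"
    # splits on "<br/>" into fragments whose tag-stripped text is
    # ["B. Jones", "S. Smith"].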
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"
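    # Illustrative article pid: issue "NSJOM_2009_39_2" and index 0 yield "NSJOM_2009_39_2a_0"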

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        print(f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link")
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:  # coverage: branch never taken
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):  # coverage: branch never taken
        raise ValueError("Article has multiple pdf hrefs")
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)
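    # Illustrative: with self.source_website "https://sites.dmi.uns.ac.rs/nsjom/"
    # (assumed value) and a relative href "papers/xyz.pdf" (hypothetical), the
    # stored link is their plain concatenation.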

    xarticle.page_range = cleanup_str(pages_tag.text)
    return xarticle