#!/usr/bin/python3

#	gathered-info-processor : Utility for processing the gathered_info.txt file
#	Copyright (C) 2026 Alexey Appolonov
#
#	This program is free software: you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation, either version 3 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program.  If not, see <http://www.gnu.org/licenses/>.

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

import argparse
import os

# Help text shown by "--help"; The line format described here has to match
# the one that GetCols() parses ("<a> >> <b ..> | <c ..>")
DESCRIPTION = '''
	Process gathered info in various ways; Gathered info is a text file, where
	each line has the following format: "<initial_url_i> >> <url_i1> <url_i2> ..
	<url_in> | <lang_i1> <lang_i2> .. <lang_im>"'''

# Names of the supported operations (valid values of the "--operation" option)
O_COMPL = 'complement_urls'
OPERATIONS = (O_COMPL,)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Command-line interface: a required operation name, a required path of the
# file with gathered info, and an optional switch for rewriting that file
argparser = argparse.ArgumentParser(description=DESCRIPTION)

# Operation to run; the value has to be one of OPERATIONS
argparser.add_argument(
	'-o',
	'--operation',
	required=True,
	choices=OPERATIONS,
	type=str,
	metavar='OPERATION_NAME',
	help='The operation to be performed ({})'.format(', '.join(OPERATIONS)),
	)

# Path of the input file
argparser.add_argument(
	'-f',
	'--file_path',
	required=True,
	type=str,
	metavar='ABS_FILE_PATH',
	help='Absolute path of a file that contains gathered info',
	)

# When set, the result replaces the input file
argparser.add_argument(
	'--rewrite',
	action='store_true',
	help='Write complemented info back to the initial file',
	)

# Parsed at import time; the functions of this file read the result
# through the module-level name "args"
args = argparser.parse_args()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def GetCols(line):

	'''Split a line "<a> >> <b0> <b1> .. <bn> | <c0> <c1> .. <cm>" into
	a tuple ("<a>", "<b0> <b1> .. <bn>", "<c0> <c1> .. <cm>").

	Returns an empty tuple for a blank line and None for a line that does
	not contain exactly one " >> " separator or has no " | " separator after
	it; Anything that follows a second " | " separator is ignored.'''

	stripped = line.strip()
	if stripped == '':
		return ()

	parts = stripped.split(' >> ')
	if len(parts) != 2:
		return None
	head, tail = parts

	# Only the first two fields are used, so there is no need to split further
	fields = tail.split(' | ', 2)
	if len(fields) < 2:
		return None

	return head, fields[0], fields[1]

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ReadGatheredInfo():

	'''Read gathered info from the file specified by "--file_path".

	Returns a tuple (info, warnings): info is a dict
	{<url>: ([<urls>], [<langs>])} and warnings is a list of messages about
	the lines that don't match the expected format; Blank lines are skipped
	silently; If the same <url> starts several lines, the last line wins.

	Prints an error message and terminates the program with exit status 1
	if the path doesn't exist, isn't a regular file or can't be read.'''

	if not os.path.exists(args.file_path):
		print(f'[ERROR: File {args.file_path} does not exist]')
		raise SystemExit(1)

	if not os.path.isfile(args.file_path):
		print(f'[ERROR: File {args.file_path} is not a regular file]')
		raise SystemExit(1)

	res = {}
	warnings = []

	try:
		with open(args.file_path, 'r') as f:
			# The file object is iterated directly, so the whole content
			# is never held in memory as a list of lines
			for line_num, line in enumerate(f, start=1):
				cols = GetCols(line)
				if cols is None:
					warnings.append(f'Wrong format in line {line_num}')
					continue
				if not cols:
					# Blank line
					continue
				init_url, urls, langs = cols
				res[init_url] = (urls.split(), langs.split())
	except (OSError, UnicodeDecodeError):
		# A file that can't be decoded as text is reported in the same way
		# as a file that can't be read at all
		print(f'[ERROR: Can\'t read {args.file_path}]')
		raise SystemExit(1)

	return res, warnings

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def WriteComplementBackToGatheredInfo(complement):

	'''Apply the complement to the original data and save the result.

	Every line of the file specified by "--file_path" whose initial URL has
	a non-empty entry in the complement dict {<url>: ([<urls>], [<langs>])}
	is replaced by "<url> >> <urls> | <langs>"; All the other lines
	(including blank and malformed ones) are kept unchanged.

	The result is written to the initial file if "--rewrite" is set,
	otherwise to the first path that doesn't exist yet among
	"<file_path>.compl", "<file_path>.compl1", "<file_path>.compl2", ..

	Returns the path of the written file; Prints an error message and
	terminates the program with exit status 2 if the initial file can't be
	read or the resulting file can't be written.'''

	if args.rewrite:
		new_file_path = args.file_path
	else:
		i = 0
		while True:
			new_file_path = f'{args.file_path}.compl{i if i > 0 else ""}'
			if not os.path.exists(new_file_path):
				break
			i += 1

	# Read (everything is collected before writing, because the output file
	# may be the input file itself)
	out_lines = []
	try:
		with open(args.file_path, 'r') as f:
			for line in f:
				cols = GetCols(line)
				c_info = complement.get(cols[0]) if cols else None
				if not c_info:
					# Nothing to complement (or the line can't be parsed):
					# the line goes to the output as is, so no data is lost
					out_lines.append(line)
					continue
				out_lines.append(
					f'{cols[0]} >> '
					f'{" ".join(c_info[0])} | '
					f'{" ".join(c_info[1])}\n'
					)
	except (OSError, UnicodeDecodeError):
		print(f'[ERROR: Can\'t read {args.file_path}]')
		raise SystemExit(2)

	# Write
	try:
		with open(new_file_path, 'w') as f:
			f.writelines(out_lines)
	except OSError:
		print(f'[ERROR: Can\'t write {new_file_path}]')
		raise SystemExit(2)

	return new_file_path

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def ComplementGatheredInfo(gathered_info):

	'''Build the complement of the gathered info: a dict
	{<url_b>: ([<urls>], [<langs>])} that has an entry for every record
	<url_b> that is listed by another record <url_a> but doesn't list
	<url_a> back (such records should be interconnected).

	The <urls> of an entry are the sorted union of the URLs of the <url_b>
	record (without "-" items) and of every <url_a> that lists <url_b>;
	The <langs> are the sorted languages of the <url_b> record.

	URLs are compared case-insensitively, but the record of a listed URL
	is looked up by its exact spelling.'''

	complement = {}

	for init_url, (listed_urls, _) in gathered_info.items():
		init_key = init_url.lower()

		for listed_url in listed_urls:
			# "-" items and references of a record to itself are not links
			if listed_url == '-' or listed_url.lower() == init_key:
				continue

			record = gathered_info.get(listed_url)
			if not record:
				continue

			back_urls, back_langs = record
			if init_key in {back_url.lower() for back_url in back_urls}:
				# The back link is already there
				continue

			# Start from what earlier iterations have collected for this URL
			previous = complement.get(listed_url)
			merged = set(previous[0]) if previous else set()
			merged.add(init_url)
			merged.update(u for u in back_urls if u != '-')

			complement[listed_url] = (sorted(merged), sorted(back_langs))

	return complement

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

if __name__ == '__main__':

	# Read the input and report the lines that couldn't be parsed
	gathered_info, warnings = ReadGatheredInfo()
	for w in warnings:
		print(f'[WARNING: {w}]')

	if args.operation == O_COMPL:
		complement = ComplementGatheredInfo(gathered_info)
		new_file_path = WriteComplementBackToGatheredInfo(complement)
		# Reported inside the branch, because "new_file_path" exists only
		# when this operation has been performed
		print(f'Result is saved to "{new_file_path}"')

	raise SystemExit(0)
