pipeline.py

#!/usr/bin/env python3.8

"""
Central project pipeline, controlled by a config.ini file.
Calls functions for preprocessing and analysis.
"""

import configparser
import glob
import sys

import article_selection.article_selection as article_selection
from sentiment_analysis.inference import calulate_sentiment, eval_sentiment
from sentiment_analysis.word2vec_sentiment import *
from visualization.dash_plot import dash_plot
from visualization.wordcloud import generate_word_clouds

if __name__ == "__main__":

    # read config file
    try:
        config_file = sys.argv[1]
        config = configparser.ConfigParser()
        config.read(config_file)
    except IndexError:
        print("Error: Supply a config.ini file")
        quit()

    # ==============================
    # Selection of relevant Articles
    # ==============================
    if config.getboolean("ArticleSelection", "run_article_selection"):
        # create input filepath for article selection from:
        # the path to the folders and the start and end year
        base_path = config.get("ArticleSelection", "input_path_base")
        start_year = config.getint("ArticleSelection", "start_year")
        end_year = config.getint("ArticleSelection", "end_year")
        data_path_list = [base_path + str(year) + "/" for year in range(start_year, end_year + 1)]
        # create list of all data paths
        json_file_list = []
        for path in data_path_list:
            json_file_list += [file_path for file_path in glob.glob(path + "*.json")]

        # get keywords, output_path
        search_keywords = config.get("ArticleSelection", "search_words").lower().split(", ")
        output_base = config.get("ArticleSelection", "output_base")

        # create new file or append to existing file
        create_new_files = not config.getboolean("ArticleSelection", "append_to_existing_file")

        # if annotation files are needed get training_size and seed 
        use_annotation = config.getboolean("ArticleSelection", "use_annotation")
        training_size = config.getint("ArticleSelection", "training_size")
        seed = config.getint("ArticleSelection", "seed")

        # open all files containing an article 
        # check if the topic is relevannt
        # if use_anotation is True output is split in four files: 
        #   1. evaluation (size_all-training_size)
        #   2. 3 annotation files with the names of the annotators 1/3 training_size
        article_selection.write_relevant_content_to_file(json_file_list,
                                                         output_base,
                                                         search_keywords=search_keywords,
                                                         new=create_new_files,
                                                         training_size=training_size,
                                                         seed=seed,
                                                         annotation=use_annotation)

    # ===================
    # Word2Vec analysis
    # ===================
    if config.getboolean("Analysis", "run_w2v"):
        print("\nStart word2vec analysis")
        input_file = config.get("Analysis", "input_file")
        search_words = config.get("Analysis", "search_words_w2v").lower().split(",")
        base_output_path = config.get("Analysis", "output_base_w2v")
        start_year = config.getint("Analysis", "start_year")
        end_year = config.getint("Analysis", "end_year")
        number_most_sim = config.getint("Analysis", "number_most_sim")

        # all articles of the same year are one dataset
        if config.getboolean("Analysis", "run_by_year"):
            similarity_by_year(input_file, base_output_path, search_words,
                               start_year, end_year, number_most_sim)

        # all articles of the same publisher are one dataset
        if config.getboolean("Analysis", "run_by_publisher"):
            similarity_by_publisher(input_file, base_output_path, search_words,
                                    start_year, end_year, number_most_sim)

        # all articles of the same publisher during the same year are one dataset
        if config.getboolean("Analysis", "run_by_publisher_by_year"):
            similarity_by_year_and_publisher(input_file, base_output_path,
                                             search_words, start_year,
                                             end_year, number_most_sim)
        print("\nEnd word2vec analysis\n")

    # ==================
    # Sentiment analysis
    # ==================
    if config.getboolean("Analysis", "run_senti"):
        print("\nStart sentiment analysis")
        input_file = config.get("Analysis", "input_file")
        search_words = config.get("Analysis", "search_words").lower().split(",")
        output_file = config.get("Analysis", "output_senti")
        methods = config.get('Analysis', 'senti_methods').lower().split(", ")
        finetuned_sentibert_path = config.get('Analysis', 'finetuned_sentibert_path')

        # calculate the article sentiment
        # using one or multiple of the following methods:
        #   1. Sentiment-Dictionary (sentiws)
        #   2. Neuronal-Network based Bert model
        #      trained for gerneral sentiment analysis
        #   3. Same Bert model with additional training
        #      using labeled parts of news-articles about refugees
        calulate_sentiment(
            input_file,
            output_file,
            search_words,
            methods=methods,
            finetuned_sentibert_path=finetuned_sentibert_path
        )

    # =============================
    # Sentiment analysis evaluation
    # =============================
    if config.getboolean("Analysis", "run_senti_eval"):
        print("\nStart Evaluation")
        senti_eval_input = config.get("Analysis", "senti_eval_input")
        search_words = config.get("Analysis", "search_words").lower().split(",")
        senti_eval_output = config.get("Analysis", "senti_eval_output")
        methods = config.get('Analysis', 'senti_methods').lower().split(", ")
        finetuned_sentibert_path = config.get('Analysis', 'finetuned_sentibert_path')

        # Perform quantitative evaluation of sentiment analysis approaches
        # using one or multiple of the following methods:
        #   1. Sentiment-Dictionary (sentiws)
        #   2. Neuronal-Network based Bert model
        #      trained for gerneral sentiment analysis
        #   3. Same Bert model with additional training
        #      using labeled parts of news-articles about refugees
        eval_sentiment(
            senti_eval_input,
            senti_eval_output,
            search_words,
            methods=methods,
            finetuned_sentibert_path=finetuned_sentibert_path
        )

    # ==================
    # Plotting
    # ==================
    if config.getboolean("Plotting", "sentiment_plot"):
        input_file = config.get("Plotting", "input_file")
        dash_plot(input_file)

    # ==================
    # WordClouds
    # ==================
    if config.getboolean("WordClouds", "wordcloud_plot"):
        input_file = config.get("WordClouds", "input_file")
        output_path = config.get("WordClouds", "output_path")
        words = config.get("WordClouds", "words").lower().split(", ")
        column_values = config.get("WordClouds", "column_values").lower().split(", ")
        number_of_words_in_wordcloud = config.getint("WordClouds", "number_of_words_in_wordcloud")

        generate_word_clouds(input_file, words, column_values, output_path, number_of_words_in_wordcloud)