/* comp10002-project01/program.c */

/* Emoticon message cleanser:
 *
 * Skeleton code written by Farhana Choudhury and Jianzhong Qi, April 2020
 *
 * Authorship Declaration:
 * (1) I certify that the program contained in this submission is completely
 * my own individual work, except where explicitly noted by comments that
 * provide details otherwise.  I understand that work that has been developed
 * by another student, or by me in collaboration with other students,
 * or by non-students as a result of request, solicitation, or payment,
 * may not be submitted for assessment in this subject.  I understand that
 * submitting for assessment work developed by or in collaboration with
 * other students or non-students constitutes Academic Misconduct, and
 * may be penalized by mark deductions, or by other penalties determined
 * via the University of Melbourne Academic Honesty Policy, as described
 * at https://academicintegrity.unimelb.edu.au.
 *
 * (2) I also certify that I have not provided a copy of this work in either
 * softcopy or hardcopy or any other form to any other student, and nor will
 * I do so until after the marks are released. I understand that providing
 * my work to other students, regardless of my intention or any undertakings
 * made to me by that other student, is also Academic Misconduct.
 *
 * (3) I further understand that providing a copy of the assignment
 * specification to any form of code authoring or assignment tutoring
 * service, or drawing the attention of others to such services and code
 * that may have been made available via such a service, may be regarded
 * as Student General Misconduct (interfering with the teaching activities
 * of the University and/or inciting others to commit Academic Misconduct).
 * I understand that an allegation of Student General Misconduct may arise
 * regardless of whether or not I personally make use of such solutions
 * or sought benefit from such actions.
 *
 * Signed by: Rory Healy 964275
 * Dated:     9th April 2020
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#define STAGE_NUM_ONE 1							/* stage numbers, passed to print_stage_header() */
#define STAGE_NUM_TWO 2
#define STAGE_NUM_THREE 3
#define STAGE_NUM_FOUR 4
#define STAGE_NUM_FIVE 5
#define STAGE_HEADER "Stage %d\n==========\n"	/* stage header format string */

#define MAX_MSG_LENGTH 280						/* maximum message length, excluding the '\0' */
#define MAX_NUM_MSGS 100						/* maximum number of messages */
#define MAX_EMTCN_LENGTH 50						/* maximum dictionary line length, excluding the '\0' */
#define MAX_NUM_EMTCN 50						/* maximum number of dictionary lines */

typedef char msg_t[MAX_MSG_LENGTH+1];			/* a message (one extra byte for the '\0') */
typedef char emtcn_t[MAX_EMTCN_LENGTH+1];		/* an emoticon or a whole dictionary line */

/****************************************************************/

/* function prototypes */

/* input/output helpers and the stage 1 token counter */
void read_one_msg(msg_t one_msg, int max_len);
void print_stage_header(int stage_num);
int count_tokens(msg_t one_msg);

/* one entry point per stage, called in order from main() */
void stage_one(msg_t one_msg);
void stage_two(msg_t msgs[], int *num_msgs);
void stage_three(msg_t msgs[], int num_msgs);
void stage_four(emtcn_t emtcns[], int *num_emtcns);
void stage_five(msg_t msgs[], int num_msgs, emtcn_t emtcns[], int num_emtcns);

/* add your own function prototypes here */

/* stage 2 helper: returns 1 when the line starts with "###" */
int is_seperating_line(msg_t one_msg);

/* stage 3 helpers: each edits one_msg in place */
void remove_leading_commas(msg_t one_msg, int msg_len);
void remove_trailing_commas(msg_t one_msg, int msg_len);
void remove_consecutive_commas(msg_t one_msg, int msg_len);
int length_consecutive_commas(msg_t one_msg, int msg_len);

/* stage 5 helpers: tokenising a message and looking tokens up in the
 * dictionary (get_emtcn_len is declared here but has no definition in
 * this file)
 */
void get_next_emtcn(msg_t one_msg, emtcn_t emtcn_msg, \
                    int *emtcn_len, int *msg_offset);
void remove_curr_emtcn(msg_t one_msg, int emtcn_len, int offset, int *msg_len);
int get_emtcn_len(msg_t one_msg, int msg_len, int msg_offset);
int is_emtcns_same(emtcn_t emtcn1, emtcn_t emtcn2, int emtcn_len);
int is_in_dict(emtcn_t curr_emtcn, int curr_emtcn_len, \
               emtcn_t emtcns[], int num_emtcns);

/****************************************************************/

/* main function controls all the action, do NOT modify this function
 *
 * Runs the five stages in order on standard input. argc and argv are
 * not used. Always returns 0; an over-long input line terminates the
 * program from inside read_one_msg() instead.
 */
int
main(int argc, char *argv[]) {
	/* to hold all input messages */
	msg_t msgs[MAX_NUM_MSGS];
	/* to hold the number of input messages */
	int num_msgs = 0;
	/* to hold all input emoticons in the dictionary */
	emtcn_t emtcns[MAX_NUM_EMTCN];
	/* to hold the number of emoticons in the dictionary */
	int num_emtcns = 0;

	/* stage 1: reading the first message (stored in msgs[0]) */
	stage_one(msgs[num_msgs]);
	num_msgs++;

	/* stage 2: removing alphanumeric characters; stage_two resets and
	 * recomputes num_msgs itself
	 */
	stage_two(msgs, &num_msgs);

	/* stage 3: removing extra commas */
	stage_three(msgs, num_msgs);

	/* stage 4: reading the dictionary and finding the longest emoticon */
	stage_four(emtcns, &num_emtcns);

	/* stage 5: removing invalid emoticons with the help of the dictionary */
	stage_five(msgs, num_msgs, emtcns, num_emtcns);

	/* all done; take some rest */
	return 0;
}

/* read a line of input into one_msg
 *
 * Consumes characters from stdin up to (and including) the first '\n',
 * '\r' or EOF, storing at most max_len of them followed by a '\0'.
 * A longer line prints an error message and exits with EXIT_FAILURE.
 */
void
read_one_msg(msg_t one_msg, int max_len) {
	int used = 0;

	for (;;) {
		int ch = getchar();

		/* any line terminator (or end of input) finishes the message */
		if (ch == EOF || ch == '\n' || ch == '\r') {
			break;
		}

		/* no room left for another character */
		if (used >= max_len) {
			printf("Invalid input line, toooooooo long.\n");
			exit(EXIT_FAILURE);
		}

		one_msg[used] = ch;
		used++;
	}
	one_msg[used] = '\0';
}

/* print stage header given stage number
 *
 * stage_num is substituted into the STAGE_HEADER format string, giving a
 * "Stage <n>" line followed by a line of '=' characters on stdout.
 */
void
print_stage_header(int stage_num) {
	printf(STAGE_HEADER, stage_num);
}

/****************************************************************/
/* add your code below */

/* removes an emtcn from a msg given a length and offset
 *
 * one_msg:   the message, edited in place
 * emtcn_len: number of characters to delete
 * offset:    index in one_msg of the first character to delete
 * msg_len:   in: current length of one_msg; out: length after the removal
 *
 * Only the emtcn_len characters are deleted; any comma that follows them
 * is kept. A request that does not lie entirely inside the message
 * (non-positive length, negative offset, or a span running past *msg_len)
 * is ignored and leaves both the message and *msg_len untouched.
 */
void
remove_curr_emtcn(msg_t one_msg, int emtcn_len, int offset, int *msg_len) {
	int src, dst;

	if (emtcn_len <= 0 || offset < 0 || offset + emtcn_len > *msg_len) {
		return;
	}

	/* slide the tail of the message left over the removed span; copying
	 * forwards is safe because dst always trails src
	 */
	dst = offset;
	for (src = offset + emtcn_len;
	     src < *msg_len && one_msg[src] != '\0'; src++) {
		one_msg[dst++] = one_msg[src];
	}
	one_msg[dst] = '\0';

	/* report the length that was actually produced */
	*msg_len = dst;
}

/* finds the next emoticon in one_msg and copies it to emtcn_msg
 *
 * one_msg:    the message being scanned
 * emtcn_msg:  receives the token that starts at *msg_offset and ends just
 *             before the next ',' or the end of the message
 * emtcn_len:  out: the full length of that token in one_msg
 * msg_offset: in: where the token starts; out: one past the character
 *             that ended the token (i.e. the start of the next token)
 *
 * A token can be longer than an emtcn_t can hold, because messages are
 * much longer than dictionary lines. In that case only the first
 * MAX_EMTCN_LENGTH characters are copied, but *emtcn_len still reports the
 * real length so the caller can remove the whole token.
 */
void
get_next_emtcn(msg_t one_msg, emtcn_t emtcn_msg, \
               int *emtcn_len, int *msg_offset) {
	int start = *msg_offset;
	int len = 0;

	while (one_msg[start + len] != '\0' && one_msg[start + len] != ',') {
		/* never write past the end of emtcn_msg */
		if (len < MAX_EMTCN_LENGTH) {
			emtcn_msg[len] = one_msg[start + len];
		}
		len++;
	}
	emtcn_msg[len < MAX_EMTCN_LENGTH ? len : MAX_EMTCN_LENGTH] = '\0';

	*emtcn_len = len;
	*msg_offset = start + len + 1; /* add 1 to account for the comma */
}

/* tests if two emtcns are identical
 *
 * Compares only the first emtcn_len characters of each; returns 1 when
 * they all match (including when emtcn_len is not positive), 0 otherwise.
 */
int
is_emtcns_same(emtcn_t emtcn1, emtcn_t emtcn2, int emtcn_len) {
	int pos = 0;

	/* advance while the characters agree */
	while (pos < emtcn_len && emtcn1[pos] == emtcn2[pos]) {
		pos++;
	}

	/* reaching the end means no mismatch was found */
	return pos >= emtcn_len;
}

/* tests if an emoticon is in the dictionary emtcns[]
 *
 * curr_emtcn:     the candidate emoticon
 * curr_emtcn_len: its length
 * emtcns:         dictionary lines; the emoticon is the part of each line
 *                 before the first ',' (or the whole line if it has none)
 * num_emtcns:     number of dictionary lines
 *
 * Returns 1 if some dictionary emoticon has exactly the same length and
 * characters as curr_emtcn, 0 otherwise.
 */
int
is_in_dict(emtcn_t curr_emtcn, int curr_emtcn_len, \
           emtcn_t emtcns[], int num_emtcns) {
	int i, dict_len;

	for (i = 0; i < num_emtcns; i++) {
		/* measure the emoticon at the start of this dictionary line; stop
		 * at the end of the line too, so a line without a comma cannot
		 * send the scan past the buffer
		 */
		dict_len = 0;
		while (dict_len < MAX_EMTCN_LENGTH &&
		       emtcns[i][dict_len] != ',' &&
		       emtcns[i][dict_len] != '\0') {
			dict_len++;
		}

		/* compares lengths first to avoid unnecessary function calls */
		if (dict_len != curr_emtcn_len) {
			continue;
		}

		/* the line itself can be compared, no copy is needed */
		if (is_emtcns_same(emtcns[i], curr_emtcn, curr_emtcn_len)) {
			return 1;
		}
	}
	return 0;
}

/* removes leading commas from a message
 *
 * one_msg is edited in place. At most msg_len characters following the
 * leading commas are kept.
 */
void
remove_leading_commas(msg_t one_msg, int msg_len) {
	int skip = 0;	/* number of leading commas */
	int kept = 0;	/* number of characters moved to the front */

	while (one_msg[skip] == ',') {
		skip++;
	}

	/* shift the remainder down; reads stay ahead of (or level with)
	 * writes, so no scratch copy is required
	 */
	while (kept < msg_len && one_msg[skip + kept] != '\0') {
		one_msg[kept] = one_msg[skip + kept];
		kept++;
	}
	one_msg[kept] = '\0';
}

/* removes trailing commas from a message
 *
 * one_msg: the message, edited in place
 * msg_len: its current length; a negative value is treated as 0
 *
 * The message is cut at the position just after its last non-comma
 * character. An empty or all-comma message becomes the empty string.
 */
void
remove_trailing_commas(msg_t one_msg, int msg_len) {
	int end = (msg_len > 0) ? msg_len : 0;

	/* the end > 0 test stops the scan at the start of the message, so an
	 * empty or all-comma message never reads one_msg[-1]
	 */
	while (end > 0 && one_msg[end - 1] == ',') {
		end--;
	}
	one_msg[end] = '\0';
}

/* removes consecutive commas from within a message
 *
 * one_msg: the message, edited in place
 * msg_len: number of characters of one_msg to consider; anything beyond
 *          it is discarded. A negative value leaves one_msg untouched.
 *
 * Every run of two or more commas is collapsed to a single comma in one
 * left-to-right pass.
 */
void remove_consecutive_commas(msg_t one_msg, int msg_len) {
	int read, write = 0;

	if (msg_len < 0) {
		return;
	}

	for (read = 0; read < msg_len && one_msg[read] != '\0'; read++) {
		/* skip a comma when the last character kept was also a comma */
		if (one_msg[read] == ',' && write > 0 && one_msg[write - 1] == ',') {
			continue;
		}
		/* write never overtakes read, so this is safe in place */
		one_msg[write++] = one_msg[read];
	}
	one_msg[write] = '\0';
}

/* finds the length of the first run of two or more commas
 *
 * Only positions below msg_len are examined (each together with the
 * character after it). Returns the length of that run, or 1 when the
 * message contains no adjacent commas.
 */
int length_consecutive_commas(msg_t one_msg, int msg_len) {
	int pos = 0;
	int extra = 0;	/* commas in the run beyond the first one */

	/* step 1: skip ahead to the first pair of adjacent commas */
	while (pos < msg_len &&
	       !(one_msg[pos] == ',' && one_msg[pos + 1] == ',')) {
		pos++;
	}

	/* step 2: count every comma in the run that has a comma after it */
	while (pos < msg_len &&
	       one_msg[pos] == ',' && one_msg[pos + 1] == ',') {
		extra++;
		pos++;
	}

	return extra + 1;
}

/* checks if the current message is a separating line, i.e. whether it
 * starts with "###"; returns 1 if so and 0 otherwise
 */
int
is_seperating_line(msg_t one_msg) {
	static const char marker[] = "###";

	/* sizeof counts the '\0', so compare one character fewer */
	return strncmp(one_msg, marker, sizeof marker - 1) == 0;
}

/* scan a message and count the number of tokens in it
 *
 * Tokens are separated by commas, so the result is the number of commas
 * plus one (an empty message therefore counts as one token).
 */
int
count_tokens(msg_t one_msg) {
	int commas = 0;
	const char *p;

	for (p = one_msg; *p != '\0'; p++) {
		commas += (*p == ',');
	}

	return commas + 1;
}

/****************************************************************/
/* stages 1 - 5 */

/* stage 1: reading the first message
 *
 * Prints the stage header, reads one line of input into one_msg and
 * reports how many comma-separated tokens it contains.
 */
void
stage_one(msg_t one_msg) {
	int num_tokens;

	print_stage_header(STAGE_NUM_ONE);

	read_one_msg(one_msg, MAX_MSG_LENGTH);
	num_tokens = count_tokens(one_msg);

	printf("Number of tokens: %d\n\n", num_tokens);
}

/* stage 2: removing alphanumeric characters
 *
 * msgs:     on entry msgs[0] holds the message read in stage 1; on return
 *           msgs[0..*num_msgs-1] hold every message with its letters and
 *           digits stripped
 * num_msgs: out: the number of messages stored
 *
 * Further messages are read from stdin until a separating line is seen or
 * MAX_NUM_MSGS messages have been stored. The cleaned messages are then
 * printed, one per line, followed by a blank line.
 */
void
stage_two(msg_t msgs[], int *num_msgs) {
	/* the raw message currently being cleaned */
	msg_t curr_msg;
	int i, j, len;

	print_stage_header(STAGE_NUM_TWO);

	/* *num_msgs will be used as an index, hence why it is zeroed */
	*num_msgs = 0;

	/* as msgs[0] already has a message in it, start from a copy of that
	 * and only then read more messages
	 */
	snprintf(curr_msg, sizeof curr_msg, "%s", msgs[0]);

	/* strictly less than: msgs[] has exactly MAX_NUM_MSGS slots */
	while (*num_msgs < MAX_NUM_MSGS && !is_seperating_line(curr_msg)) {
		/* copy the non-alphanumeric characters straight into msgs[];
		 * the '\0' is excluded from the scan so len can never exceed
		 * MAX_MSG_LENGTH
		 */
		len = 0;
		for (j = 0; curr_msg[j] != '\0'; j++) {
			/* <ctype.h> functions need an unsigned char value */
			if (!isalnum((unsigned char)curr_msg[j])) {
				msgs[*num_msgs][len++] = curr_msg[j];
			}
		}
		msgs[*num_msgs][len] = '\0';
		*num_msgs += 1;

		/* prepare for the next message to be read */
		read_one_msg(curr_msg, MAX_MSG_LENGTH);
	}

	for (i = 0; i < *num_msgs; i++) {
		printf("%s\n", msgs[i]);
	}
	printf("\n");
}

/* stage 3: removing extra commas
 *
 * For each of the num_msgs messages, strips leading commas, then trailing
 * commas, then collapses runs of commas inside the message. The cleaned
 * messages are printed one per line, followed by a blank line.
 */
void
stage_three(msg_t msgs[], int num_msgs) {
	print_stage_header(STAGE_NUM_THREE);

	msg_t scratch;	/* working copy of the message being cleaned */
	int idx, len;

	for (idx = 0; idx < num_msgs; idx++) {
		/* work on a copy (terminator included) of msgs[idx] */
		len = strlen(msgs[idx]);
		memcpy(scratch, msgs[idx], len + 1);

		/* each helper needs the length left by the previous one */
		remove_leading_commas(scratch, len);
		len = strlen(scratch);
		remove_trailing_commas(scratch, len);
		len = strlen(scratch);
		remove_consecutive_commas(scratch, len);
		len = strlen(scratch);

		/* store the cleaned message back over the original */
		memcpy(msgs[idx], scratch, len + 1);
	}

	for (idx = 0; idx < num_msgs; idx++) {
		printf("%s\n", msgs[idx]);
	}
	printf("\n");
}

/* stage 4: reading the dictionary and finding the longest emoticon
 *
 * emtcns:     receives the dictionary lines read from stdin
 * num_emtcns: in/out: number of lines stored in emtcns
 *
 * Lines are read until an empty line is met or MAX_NUM_EMTCN lines are
 * stored. The emoticon of a line is the part before its first ','. Prints
 * the number of lines, the first emoticon of maximal length (an empty
 * string if there is none) and that length, followed by a blank line.
 */
void
stage_four(emtcn_t emtcns[], int *num_emtcns) {
	/* starts empty so it is always printable, even with no dictionary */
	emtcn_t max_emtcn = "";
	int max_emtcn_len = 0, curr_len, i;

	print_stage_header(STAGE_NUM_FOUR);

	/* strictly less than: emtcns[] has exactly MAX_NUM_EMTCN slots */
	while (*num_emtcns < MAX_NUM_EMTCN) {
		/* read straight into the next free slot; it only starts to count
		 * once the line turns out to be non-empty
		 */
		read_one_msg(emtcns[*num_emtcns], MAX_EMTCN_LENGTH);
		/* break as soon as the line is empty */
		if (emtcns[*num_emtcns][0] == '\0') {
			break;
		}
		*num_emtcns += 1;
	}

	for (i = 0; i < *num_emtcns; i++) {
		/* length of the emoticon: up to the first comma or end of line */
		curr_len = (int)strcspn(emtcns[i], ",");

		if (curr_len > max_emtcn_len) {
			max_emtcn_len = curr_len;
			memcpy(max_emtcn, emtcns[i], curr_len);
			max_emtcn[curr_len] = '\0';
		}
	}

	printf("Emoticon total: %d\nLongest: %s\nLength: %d\n",
	*num_emtcns, max_emtcn, max_emtcn_len);
	printf("\n");
}

/* stage 5: removing invalid emoticons with the help of the dictionary
 *
 * msgs, num_msgs:     the messages left by stage 3, edited in place
 * emtcns, num_emtcns: the dictionary read in stage 4
 *
 * Every comma-separated token of every message that is not a dictionary
 * emoticon is deleted; the commas themselves are kept. Messages that end
 * up empty are not printed.
 */
void
stage_five(msg_t msgs[], int num_msgs, emtcn_t emtcns[], int num_emtcns) {
	/* the current emtcn from msgs[] */
	emtcn_t curr_emtcn;
	int i;

	print_stage_header(STAGE_NUM_FIVE);

	for (i = 0; i < num_msgs; i++) {
		int curr_msg_len = 0;	/* length of msgs[i] */
		int curr_emtcn_len = 0;	/* length of curr_emtcn */
		int curr_offset = 0;	/* index in msgs[i] of the next token */

		/* count characters before \0 */
		while (msgs[i][curr_msg_len] != '\0') {
			curr_msg_len++;
		}

		while (curr_offset < curr_msg_len) {
			/* get the next emoticon, check if it's in emtcns, and remove it
			 * from msgs[i] if its not in emtcns
			 */
			get_next_emtcn(msgs[i], curr_emtcn, &curr_emtcn_len, &curr_offset);

			if (!is_in_dict(curr_emtcn, curr_emtcn_len, emtcns, num_emtcns)) {
				int len_before = curr_msg_len;

				remove_curr_emtcn(msgs[i], curr_emtcn_len, \
				                 (curr_offset - curr_emtcn_len - 1), \
				                 &curr_msg_len);

				/* the rest of the message has just moved left by the
				 * number of characters removed, so the scan position must
				 * move with it or the next token(s) would be skipped
				 */
				curr_offset -= len_before - curr_msg_len;
			}
		}
	}

	for (i = 0; i < num_msgs; i++) {
		if (msgs[i][0] == '\0') {
			continue;
		}
		printf("%s\n", msgs[i]);
	}
}

/*
 *             _                          _   _     _
 *     /\     | |                        (_) | |   | |
 *    /  \    | |   __ _    ___    _ __   _  | |_  | |__    _ __ ___    ___
 *   / /\ \   | |  / _` |  / _ \  | '__| | | | __| | '_ \  | '_ ` _ \  / __|
 *  / ____ \  | | | (_| | | (_) | | |    | | | |_  | | | | | | | | | | \__ \
 * /_/    \_\ |_|  \__, |  \___/  |_|    |_|  \__| |_| |_| |_| |_| |_| |___/
 *                  __/ |
 *                 |___/
 *                             ______                   _
 *     /\                     |  ____|                 | |
 *    /  \     _ __    ___    | |__     _   _   _ __   | |
 *   / /\ \   | '__|  / _ \   |  __|   | | | | | '_ \  | |
 *  / ____ \  | |    |  __/   | |      | |_| | | | | | |_|
 * /_/    \_\ |_|     \___|   |_|       \__,_| |_| |_| (_)
 *
 */