Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .streamlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Streamlit theme for the dashboard: dark palette, green primary colour, white text.
[theme]
# Green primary colour; dashboard_app.py's CSS applies the same hex to h2/h3 headings.
primaryColor="#00A86B"
# Page background; dashboard_app.py's CSS sets the same value on .stApp.
backgroundColor="#0A0A0A"
# Secondary background; dashboard_app.py's CSS uses the same value for metric cards.
secondaryBackgroundColor="#1A1A1A"
# White body text.
textColor="#FFFFFF"
font="sans serif"
326 changes: 326 additions & 0 deletions dashboard.py/dashboard_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
import sys
from pathlib import Path
import plotly.express as px

sys.path.append(str(Path(__file__).resolve().parents[1]))

import streamlit as st
from www.services.etl.pipeline import openalex_pipeline
import pandas as pd

# Page-level setup: browser tab title, wide layout, and the custom CSS that
# colours the app background, the headings and the metric cards.
st.set_page_config(page_title="Bibliometrix Dashboard", layout="wide")

custom_css = """
<style>

.stApp {
background-color: #0A0A0A;
}

h1 {
color: #D4AF37 !important;
}

h2, h3 {
color: #00A86B !important;
}

div[data-testid="metric-container"] {
background-color: #1A1A1A;
border: 1px solid #D4AF37;
padding: 15px;
border-radius: 12px;
}

</style>
"""

# unsafe_allow_html is needed so the <style> element is emitted as markup.
st.markdown(custom_css, unsafe_allow_html=True)

# --------------------------------------------------
# HEADER, SEARCH CONTROLS AND DATA LOAD
# --------------------------------------------------

# Topic used when the search box is left empty.
DEFAULT_QUERY = "machine learning"


@st.cache_data(ttl=3600, show_spinner="Fetching records from OpenAlex...")
def _load_records(query, max_results):
    """Run the OpenAlex pipeline for one (query, max_results) pair.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached to avoid re-running the pipeline when nothing about the
    search has changed (for example when the download button is clicked).

    Args:
        query: Search topic passed through to ``openalex_pipeline``.
        max_results: Maximum number of documents requested from the pipeline.

    Returns:
        Whatever ``openalex_pipeline`` returns; the rest of the script treats
        it as a pandas DataFrame.
    """
    return openalex_pipeline(query=query, max_results=max_results)


st.title("📚 Bibliometrix Dashboard")
st.write("OpenAlex ETL + Bibliometric Analysis")

colA, colB = st.columns([3, 1])

with colA:
    query = st.text_input(
        "Search Topic",
        placeholder="🔍 Search topics like AI, Machine Learning, Data Science...",
        label_visibility="collapsed",
    )

with colB:
    max_results = st.selectbox(
        "Documents",
        [50, 100, 200, 500],
        index=1,
    )

# A whitespace-only entry is truthy, so strip before falling back to the
# default topic; otherwise blanks would be sent to the pipeline as a query.
query = query.strip() or DEFAULT_QUERY

df = _load_records(query, max_results)

# The sections below index into the first row of derived tables; stop early
# with a readable message instead of failing further down on an empty result.
if df.empty:
    st.warning(f"No documents were returned for '{query}'. Try another topic.")
    st.stop()
# Headline figures for the current result set, one metric card per column.


def _distinct_count(column):
    """Return the number of distinct non-null values in ``df[column]`` after ``explode()``."""
    return df[column].explode().dropna().nunique()


doc_slot, author_slot, keyword_slot, citation_slot = st.columns(4)

with doc_slot:
    st.metric("Documents", len(df))

with author_slot:
    st.metric("Authors", _distinct_count("AU"))

with keyword_slot:
    st.metric("Keywords", _distinct_count("DE"))

with citation_slot:
    # Missing citation counts are treated as zero before summing.
    citation_total = int(df["TC"].fillna(0).sum())
    st.metric("Total Citations", citation_total)
# ==================================================
# DATASET PREVIEW
# ==================================================

# Gold section heading rendered as raw HTML, followed by the full table.
preview_heading = """
<h2 style='color:#D4AF37;
font-size:42px;
font-weight:700;'>
📄 STANDARDIZED DATASET PREVIEW
</h2>
"""
st.markdown(preview_heading, unsafe_allow_html=True)

st.dataframe(df, use_container_width=True)

# ==================================================
# PUBLICATIONS BY YEAR
# ==================================================

st.divider()

st.markdown("""
<h2 style='color:#D4AF37;
font-size:42px;
font-weight:700;'>
📊 PUBLICATIONS BY YEAR
</h2>
""", unsafe_allow_html=True)

# Number of documents per publication year, in chronological order.
year_counts = df["PY"].value_counts().sort_index()

# A fixed dtick of 1 draws one y-axis tick per publication, which becomes
# unreadable once a year holds dozens or hundreds of records. Scale the step
# so roughly ten integer ticks are shown; it stays at 1 for small counts.
peak_count = int(year_counts.max()) if not year_counts.empty else 0
tick_step = max(1, -(-peak_count // 10))  # ceiling division

fig = px.bar(
    x=year_counts.index,
    y=year_counts.values,
)

fig.update_traces(marker_color="#D4AF37")

fig.update_layout(
    paper_bgcolor="#0A0A0A",
    plot_bgcolor="#0A0A0A",
    font_color="white",
    xaxis_title="Year",
    yaxis_title="Publications",
    yaxis=dict(dtick=tick_step),
    showlegend=False,
)

st.plotly_chart(fig, width="stretch")

# ==================================================
# TOP CITED PAPERS
# ==================================================

st.divider()

cited_heading = """
<h2 style='color:#D4AF37;
font-size:42px;
font-weight:700;'>
🏆 TOP 10 MOST CITED PAPERS
</h2>
"""
st.markdown(cited_heading, unsafe_allow_html=True)

# The table is only built when a citation-count column is present.
if "TC" in df.columns:

    # Order by citations (highest first), then keep title, citations and year
    # for the first ten rows with a fresh 0-based index.
    by_citations = df.sort_values("TC", ascending=False)
    top_papers = by_citations[["TI", "TC", "PY"]].head(10).reset_index(drop=True)

    st.dataframe(top_papers, width="stretch")

    st.write("Rows shown:", len(top_papers))
# ==================================================
# TOP AUTHORS
# ==================================================

st.divider()

st.markdown("""
<h2 style='color:#00C78C;
font-size:42px;
font-weight:700;'>
👥 TOP AUTHORS
</h2>
""", unsafe_allow_html=True)

# Up to ten most frequent author names with their document counts.
top_authors = (
    df["AU"]
    .explode()
    .dropna()
    .value_counts()
    .head(10)
    .reset_index()
)

top_authors.columns = ["Author", "Publications"]

if top_authors.empty:
    # Nothing to rank: reading the first row below would raise IndexError.
    st.info("No author information is available for these results.")
else:
    # Ranking Column. The label list is cut to the number of rows actually
    # present; inserting a fixed 10-item list raises ValueError whenever the
    # result set has fewer than ten distinct authors.
    rank_labels = ["🥇", "🥈", "🥉"] + [str(place) for place in range(4, 11)]
    top_authors.insert(0, "Rank", rank_labels[: len(top_authors)])

    table_col, stat_col = st.columns([3, 1])

    with table_col:
        st.dataframe(
            top_authors,
            use_container_width=True,
            hide_index=True,
        )

    with stat_col:
        # Rows are already sorted by count, so the first row is the leader.
        leader = top_authors.iloc[0]

        st.metric("TOP AUTHOR", leader["Author"])

        st.metric("PUBLICATIONS", int(leader["Publications"]))
# ==================================================
# TOP KEYWORDS
# ==================================================

st.divider()

keywords_heading = """
<h2 style='color:#D4AF37;
font-size:42px;
font-weight:700;'>
🔑 TOP KEYWORDS
</h2>
"""
st.markdown(keywords_heading, unsafe_allow_html=True)

# Ten most frequent keywords as a two-column frame (Keyword, Frequency).
keyword_counts = df["DE"].explode().dropna().value_counts().head(10)
top_keywords = keyword_counts.rename_axis("Keyword").reset_index(name="Frequency")

# Horizontal bars, drawn from the frame sorted by ascending frequency.
keyword_fig = px.bar(
    top_keywords.sort_values("Frequency"),
    x="Frequency",
    y="Keyword",
    orientation="h",
    text="Frequency",
)

keyword_fig.update_traces(marker_color="#D4AF37", textposition="outside")

keyword_layout = dict(
    paper_bgcolor="#0A0A0A",
    plot_bgcolor="#0A0A0A",
    font_color="white",
    xaxis_title="Frequency",
    yaxis_title="",
    showlegend=False,
    height=600,
    margin=dict(l=20, r=20, t=20, b=20),
)
keyword_fig.update_layout(**keyword_layout)

keyword_fig.update_xaxes(nticks=8)

st.plotly_chart(keyword_fig, use_container_width=True)
st.divider()

st.markdown("""
<h2 style='color:#D4AF37;font-size:42px;font-weight:700;'>
⬇ EXPORT RESULTS
</h2>
""", unsafe_allow_html=True)

# The query is free text typed by the user, so it can contain path separators
# or characters that are not allowed in file names (e.g. "AI/ML", "covid: x").
# Replace those and control characters with "_"; ordinary queries keep the
# exact file name they had before. Fall back to a fixed stem if nothing usable
# is left.
unsafe_name_chars = set('\\/:*?"<>|')
safe_stem = "".join(
    "_" if ch in unsafe_name_chars or ord(ch) < 32 else ch
    for ch in query
).strip(" .") or "openalex"

# Serialise the full result set without the DataFrame index column.
csv_text = df.to_csv(index=False)

st.download_button(
    label="📥 Download Dataset (CSV)",
    data=csv_text,
    file_name=f"{safe_stem}_bibliometric_data.csv",
    mime="text/csv",
)

st.divider()

# Centred credits footer: authors, technologies, course and professor.
footer_html = """
<div style="
text-align:center;
padding:30px;
font-size:15px;
line-height:1.8;
color:#CCCCCC;
">

<h3 style="color:#D4AF37;">
📚 Bibliometrix Dashboard
</h3>

<b style="color:#00C78C;">Developed by</b><br>
Madhumithra Balasubramanian<br>
Aya Soundous Hechaichi<br>
Alina Siddiqui

<br>

<b style="color:#00C78C;">Technologies Used</b><br>
Python • Streamlit • OpenAlex API • Bibliometrix Framework

<br>

<b style="color:#00C78C;">Hardware and Software for Big Data – Mod B</b><br>
University of Naples Federico II

<br>

<b style="color:#00C78C;">Professor:</b> Vincenzo Moscato<br>Data Science Course – Academic Year 2025/2026
</div>
"""

# Rendered as raw HTML so the inline styles and line breaks take effect.
st.markdown(footer_html, unsafe_allow_html=True)
2 changes: 1 addition & 1 deletion functions/get_co_occurence_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter
The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC')
"""
# Get the field data
M = df.get()
M = df

# Create co-occurrence matrix
A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms)
Expand Down
1 change: 0 additions & 1 deletion functions/get_collaborationnetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def get_collaboration_network(
print("Generating collaboration network...")

M = df
m = df.get()
NetRefs = None
Title = ""

Expand Down
2 changes: 1 addition & 1 deletion functions/get_factorialanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_factorial_analysis(
# Set ngrams based on word_type
ngrams = int(ngram) if field in ['TI', 'AB'] else 1

M = df.get()
M = df
tab = table_tag(M, field, ngrams)

if len(tab) >= 2:
Expand Down
2 changes: 1 addition & 1 deletion functions/get_frequentwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None):
"""
Extract and count words from a specified field in the DataFrame.
"""
M = df.get()
M = df

# Remove duplicates
M = M.drop_duplicates(subset='SR')
Expand Down
2 changes: 1 addition & 1 deletion functions/get_localcitedauthors.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_local_cited_authors(df, num_of_cited_authors, fast_search=False):
loccit = 1

df = metaTagExtraction(df, "SR")
M = df.get()
M = df

# Fill missing values
M['TC'] = M['TC'].fillna(0)
Expand Down
2 changes: 1 addition & 1 deletion functions/get_localciteddocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast
A Plotly figure object and a DataFrame of the most local cited documents.
"""
df = metaTagExtraction(df, "SR")
M = df.get()
M = df

# Determine the local citation threshold
if fast_search:
Expand Down
2 changes: 1 addition & 1 deletion functions/get_treemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None):
"""
Extract and count words from a specified field in the DataFrame.
"""
M = df.get()
M = df

# Remove duplicates
M = M.drop_duplicates(subset='SR')
Expand Down
Loading